diff --git a/.bazelrc b/.bazelrc index 1538ddd284f616..bd71ab3016563d 100644 --- a/.bazelrc +++ b/.bazelrc @@ -255,6 +255,14 @@ build:cuda_clang_official --action_env=CLANG_CUDA_COMPILER_PATH="/usr/lib/llvm-1 build:cuda_clang_official --action_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:cuda_clang_official --crosstool_top="@sigbuild-r2.16-clang_config_cuda//crosstool:toolchain" +# Build with nvcc for CUDA and clang for host +build:nvcc_clang --config=cuda +# Unfortunately, cuda_configure.bzl demands this for using nvcc + clang +build:nvcc_clang --action_env=TF_CUDA_CLANG="1" +build:nvcc_clang --action_env=TF_NVCC_CLANG="1" +build:nvcc_clang --@local_config_cuda//:cuda_compiler=nvcc + + # Debug config build:dbg -c dbg # Only include debug info for files under tensorflow/, excluding kernels, to @@ -527,8 +535,8 @@ build:rbe_linux_cuda --repo_env=TF_NCCL_CONFIG_REPO="@sigbuild-r2.16-clang_confi test:rbe_linux_cuda --test_env=LD_LIBRARY_PATH="/usr/local/cuda/lib64:/usr/local/cuda/extras/CUPTI/lib64" build:rbe_linux_cuda_nvcc --config=rbe_linux_cuda +build:rbe_linux_cuda_nvcc --config=nvcc_clang build:rbe_linux_cuda_nvcc --repo_env TF_NCCL_USE_STUB=1 -build:rbe_linux_cuda_nvcc --action_env=TF_NVCC_CLANG="1" # TODO(kanglan): Remove rbe_win and rbe_win_py3* after b/289091160 is fixed build:rbe_win --config=rbe_base @@ -577,6 +585,7 @@ build:elinux_armhf --copt -mfp16-format=ieee # Load rc file written by ./configure. try-import %workspace%/.tf_configure.bazelrc +try-import %workspace%/xla_configure.bazelrc # Load rc file with user-specific options. try-import %workspace%/.bazelrc.user @@ -719,7 +728,7 @@ build:no_tfrt --deleted_packages=tensorflow/compiler/mlir/tfrt,tensorflow/compil # Use --config=tf_public_cache to try and use the TensorFlow public build cache # to build TensorFlow. Look at ci/official/envs to find which types of jobs # push to the cache. 
For macOS, use --config=tf_public_macos_cache -build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/september2022" --remote_upload_local_results=false +build:tf_public_cache --remote_cache="https://storage.googleapis.com/tensorflow-devinfra-bazel-cache/january2024" --remote_upload_local_results=false # Cache pushes are limited to TF's CI system. build:tf_public_cache_push --config=tf_public_cache --remote_upload_local_results=true --google_default_credentials # Public cache for macOS builds @@ -777,28 +786,38 @@ test:linux_cuda_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-os test:linux_cuda_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium test:linux_cuda_pycpp_test --config=linux_cuda_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... # ARM64 PYCPP -test:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only -test:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only -test:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3 +# In Linux Arm64 presubmit/continuous build, we cross-compile the binaries on +# Linux x86 so that we can use RBE. Since tests still need to run on the single +# host Arm64 machine, the build becomes too slow (~30 min) to be a presubmit. +# For testing purposes, we want to see the runtime performance of an +# experimental job that is build-only, i.e, we only build the test targets and +# do not run them. By prefixing the configs with "build", we can run both +# `bazel build` and `bazel test` commands with the same config as test configs +# inherit from build. 
+build:linux_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only +build:linux_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-no_aarch64,-oss_excluded,-oss_serial,-gpu,-tpu,-benchmark-test,-v1only +build:linux_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium --flaky_test_attempts=3 # TODO(michaelhudgins): Why do we need to specifically omit go and java here? -test:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test +build:linux_arm64_pycpp_test --config=linux_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... 
-//tensorflow/core/grappler/optimizers:auto_mixed_precision_test_cpu -//tensorflow/core/grappler/optimizers:remapper_test_cpu -//tensorflow/core/kernels/image:resize_bicubic_op_test -//tensorflow/compiler/mlir/tfr/examples/customization:test_ops_test -//tensorflow/compiler/mlir/tfr/examples/mnist:mnist_ops_test -//tensorflow/compiler/mlir/tfr/examples/pad:pad_ops_test -//tensorflow/python/tools:aot_compiled_test # CROSS-COMPILE ARM64 PYCPP -test:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test +build:cross_compile_linux_arm64_pycpp_test --config=linux_arm64_pycpp_test # Tests that fail only when cross-compiled -test:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantization/stablehlo:convert_tf_quant_to_mhlo_int_test +build:cross_compile_linux_arm64_pycpp_test -//tensorflow/compiler/mlir/quantization/stablehlo:convert_tf_quant_to_mhlo_int_test # MACOS ARM64 PYCPP test:macos_arm64_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test,-no_mac_arm64,-no_aarch64 test:macos_arm64_pycpp_test_filters --test_lang_filters=cc,py --test_size_filters=small,medium test:macos_arm64_pycpp_test --config=macos_arm64_pycpp_test_filters -- //tensorflow/... -//tensorflow/python/integration_testing/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/lite/... -//tensorflow/tools/toolchains/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/compiler/aot/... 
-//tensorflow/core/kernels/image:resize_bicubic_op_test # MACOS X86 PYCPP -test:macos_x86_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test -test:macos_x86_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test -test:macos_x86_pycpp_test_filters --keep_going --test_lang_filters=cc,py --test_size_filters=small,medium -test:macos_x86_pycpp_test --config=macos_x86_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... +# These are defined as build configs so that we can run a build only job. See +# the note under "ARM64 PYCPP" for more details. +build:macos_x86_pycpp_test_filters --test_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +build:macos_x86_pycpp_test_filters --build_tag_filters=-no_oss,-oss_excluded,-oss_serial,-no_oss_py38,-no_oss_py39,-no_oss_py310,-nomac,-no_mac,-mac_excluded,-v1only,-gpu,-tpu,-benchmark-test +build:macos_x86_pycpp_test_filters --keep_going --test_lang_filters=cc,py --test_size_filters=small,medium +build:macos_x86_pycpp_test --config=macos_x86_pycpp_test_filters -- //tensorflow/... -//tensorflow/compiler/tf2tensorrt/... -//tensorflow/core/tpu/... -//tensorflow/go/... -//tensorflow/java/... -//tensorflow/python/integration_testing/... -//tensorflow/tools/toolchains/... -//tensorflow/lite/... -//tensorflow/compiler/aot/... 
# CROSS-COMPILE MACOS X86 PYCPP -test:cross_compile_macos_x86_pycpp_test --config=macos_x86_pycpp_test -test:cross_compile_macos_x86_pycpp_test -//tensorflow/core/kernels:quantized_conv_ops_test -//tensorflow/core/kernels:quantized_matmul_op_test -//tensorflow/python/ops:quantized_conv_ops_test -//tensorflow/tools/graph_transforms:transforms_test -//tensorflow/python/tools:aot_compiled_test +build:cross_compile_macos_x86_pycpp_test --config=macos_x86_pycpp_test +build:cross_compile_macos_x86_pycpp_test -//tensorflow/core/kernels:quantized_conv_ops_test -//tensorflow/core/kernels:quantized_matmul_op_test -//tensorflow/python/ops:quantized_conv_ops_test -//tensorflow/tools/graph_transforms:transforms_test -//tensorflow/python/tools:aot_compiled_test # END TF TEST SUITE OPTIONS # START CROSS-COMPILE CONFIGS @@ -855,8 +874,12 @@ build:cross_compile_macos_x86 --platform_mappings=tensorflow/tools/toolchains/cr # RBE cross-compile configs for Darwin x86 build:rbe_cross_compile_macos_x86 --config=cross_compile_macos_x86 build:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base +build:rbe_cross_compile_macos_x86 --bes_upload_mode=fully_async test:rbe_cross_compile_macos_x86 --config=rbe_cross_compile_base # Increase the test timeout as tests often take longer on mac. 
test:rbe_cross_compile_macos_x86 --test_timeout=300,450,1200,3600 +# Limit jobs to 100 to avoid running into "out of memory" issues (b/316266643) +build:rbe_cross_compile_macos_x86 --jobs=100 +test:rbe_cross_compile_macos_x86 --jobs=100 # END MACOS CROSS-COMPILE CONFIGS # END CROSS-COMPILE CONFIGS diff --git a/.bazelversion b/.bazelversion index b536fbc5061305..f3c238740e5bc3 100644 --- a/.bazelversion +++ b/.bazelversion @@ -1,2 +1,2 @@ -6.1.0 +6.5.0 # NOTE: Update Bazel version in tensorflow/tools/ci_build/release/common.sh.oss \ No newline at end of file diff --git a/.github/workflows/arm-cd.yml b/.github/workflows/arm-cd.yml index 15433f8f14be32..ddcc1b373e5c14 100644 --- a/.github/workflows/arm-cd.yml +++ b/.github/workflows/arm-cd.yml @@ -57,12 +57,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: 'nightly' - name: Checkout repository for releases (skipped for nightly) if: ${{ github.event_name == 'push' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build and test pip wheel shell: bash run: | diff --git a/.github/workflows/arm-ci-extended-cpp.yml b/.github/workflows/arm-ci-extended-cpp.yml index e648297d37e789..2f9c67fb81eede 100644 --- a/.github/workflows/arm-ci-extended-cpp.yml +++ b/.github/workflows/arm-ci-extended-cpp.yml @@ -50,12 +50,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . 
-o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: 'nightly' - name: Checkout repository if: ${{ github.event_name == 'push' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build binary and run C++ tests shell: bash run: | diff --git a/.github/workflows/arm-ci-extended.yml b/.github/workflows/arm-ci-extended.yml index 01ce70ba82ecfa..db782d3cf35f30 100644 --- a/.github/workflows/arm-ci-extended.yml +++ b/.github/workflows/arm-ci-extended.yml @@ -51,12 +51,12 @@ jobs: run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . -o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository for nightly (skipped for releases) if: ${{ github.event_name == 'schedule' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: ref: 'nightly' - name: Checkout repository if: ${{ github.event_name == 'push' }} - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build binary and run python tests on nightly for all python versions shell: bash run: | diff --git a/.github/workflows/arm-ci.yml b/.github/workflows/arm-ci.yml index 3b07683008391d..7b3e8c6f24df49 100644 --- a/.github/workflows/arm-ci.yml +++ b/.github/workflows/arm-ci.yml @@ -47,7 +47,7 @@ jobs: shell: bash run: find /home/ubuntu/actions-runner/_work/tensorflow/tensorflow/. -name . 
-o -prune -exec sudo rm -rf -- {} + || true - name: Checkout repository - uses: actions/checkout@755da8c3cf115ac066823e79a1e1788f8940201b # v3.2.0 + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - name: Build binary and run python tests shell: bash run: | diff --git a/.github/workflows/osv-scanner-scheduled.yml b/.github/workflows/osv-scanner-scheduled.yml index fb7366768436c5..a471d68b4fd2d7 100644 --- a/.github/workflows/osv-scanner-scheduled.yml +++ b/.github/workflows/osv-scanner-scheduled.yml @@ -28,7 +28,7 @@ permissions: jobs: scan-scheduled: if: github.repository == 'tensorflow/tensorflow' - uses: "google/osv-scanner/.github/workflows/osv-scanner-reusable.yml@main" + uses: "google/osv-scanner-action/.github/workflows/osv-scanner-reusable.yml@v1.6.2-beta1" with: scan-args: |- --lockfile=requirements.txt:./requirements_lock_3_9.txt diff --git a/.github/workflows/update-rbe.yml b/.github/workflows/update-rbe.yml index 1b421effec8198..bdce23b94d02f1 100644 --- a/.github/workflows/update-rbe.yml +++ b/.github/workflows/update-rbe.yml @@ -106,13 +106,13 @@ jobs: map sigbuild-r2.14-clang-python3.10 2.14-python3.10 map sigbuild-r2.14-clang-python3.11 2.14-python3.11 # TF 2.16 - map sigbuild-r2.16 2.16-python3.9 + map sigbuild-r2.16 2.16-python3.11 map sigbuild-r2.16-python3.9 2.16-python3.9 map sigbuild-r2.16-python3.10 2.16-python3.10 map sigbuild-r2.16-python3.11 2.16-python3.11 map sigbuild-r2.16-python3.12 2.16-python3.12 # TF 2.16 + Clang (containers are the same, but env vars in configs.bzl are different) - map sigbuild-r2.16-clang 2.16-python3.9 + map sigbuild-r2.16-clang 2.16-python3.11 map sigbuild-r2.16-clang-python3.9 2.16-python3.9 map sigbuild-r2.16-clang-python3.10 2.16-python3.10 map sigbuild-r2.16-clang-python3.11 2.16-python3.11 diff --git a/.gitignore b/.gitignore index cebef4f590d47e..614cde3446a16f 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ node_modules /.bazelrc.user /.tf_configure.bazelrc 
+/xla_configure.bazelrc /bazel-* /bazel_pip /tools/python_bin_path.sh diff --git a/RELEASE.md b/RELEASE.md index 784e2ac28ceea7..e75ca35b589d73 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -1,4 +1,4 @@ -# Release 2.16.0 +# Release 2.17.0 ## TensorFlow @@ -9,11 +9,31 @@ * * -* `tf.summary.trace_on` now takes a `profiler_outdir` argument. This must be set - if `profiler` arg is set to `True`. - * `tf.summary.trace_export`'s `profiler_outdir` arg is now a no-op. Enabling - the profiler now requires setting `profiler_outdir` in `trace_on`. +### Known Caveats +* +* +* + +### Major Features and Improvements + +* +* + +### Bug Fixes and Other Changes + +* +* +* + +## Keras + + + +### Breaking Changes + +* +* ### Known Caveats @@ -26,6 +46,101 @@ * * +### Bug Fixes and Other Changes + +* +* +* + +* `tf.lite` + * Quantization for `FullyConnected` layer is switched from per-tensor to + per-channel scales for dynamic range quantization use case (`float32` + inputs / outputs and `int8` weights). The change enables new quantization + schema globally in the converter and inference engine. The new behaviour + can be disabled via experimental + flag `converter._experimental_disable_per_channel_quantization_for_dense_layers = True`. + +## Thanks to our Contributors + +This release contains contributions from many people at Google, as well as: + +, , , , , + +# Release 2.16.0 + +## TensorFlow + + + +* TensorFlow Windows Build: + + * Clang is now the default compiler to build TensorFlow CPU wheels on the + Windows Platform starting with this release. The currently supported + version is LLVM/clang 17. The official Wheels-published on PyPI will be + based on Clang; however, users retain the option to build wheels using + the MSVC compiler following the steps mentioned in + https://www.tensorflow.org/install/source_windows as has been the case + before + +### Breaking Changes + +* +* + +* `tf.summary.trace_on` now takes a `profiler_outdir` argument. 
This must be + set if `profiler` arg is set to `True`. + + * `tf.summary.trace_export`'s `profiler_outdir` arg is now a no-op. + Enabling the profiler now requires setting `profiler_outdir` in + `trace_on`. + +* `tf.estimator` + + * The tf.estimator API is removed. + +* Keras 3.0 will be the default Keras version. You may need to update your + script to use Keras 3.0. + +* Please refer to the new Keras documentation for Keras 3.0 + (https://keras.io/keras_3). + +* To continue using Keras 2.0, do the following. + +* 1. Install tf-keras via pip install tf-keras~=2.16 + + 1. To switch tf.keras to use Keras 2 (tf-keras), set the environment + variable TF_USE_LEGACY_KERAS=1 directly or in your python program by + import os;os.environ["TF_USE_LEGACY_KERAS"]=1. Please note that this + will set it for all packages in your Python runtime program + +* 1. Change import of keras from tensorflow as follows +* import tensorflow.keras as keras and import keras to import tf_keras as + keras +* **Apple Silicon users:** If you previously installed TensorFlow using + `pip install tensorflow-macos`, please update your installation method. Use + `pip install tensorflow` from now on. Starting with TF 2.17, the + `tensorflow-macos` package will no longer receive updates. + +### Known Caveats + +* +* +* + +* Full aarch64 Linux and Arm64 macOS wheels are now published to the + `tensorflow` pypi repository and no longer redirect to a separate package. + +### Major Features and Improvements + +* +* + +* Support for Python 3.12 has been added. +* [tensorflow-tpu](https://pypi.org/project/tensorflow-tpu/) package is now + available for easier TPU based installs. +* TensorFlow pip packages are now built with CUDA 12.3 and cuDNN 8.9.7 + + ### Bug Fixes and Other Changes * @@ -54,6 +169,21 @@ * Added `experimental_skip_slot_variables` (a boolean option) to skip restoring of optimizer slot variables in a checkpoint. 
+* `tf.saved_model.SaveOptions` + + * `SaveOptions` now takes a new argument called + `experimental_debug_stripper`. When enabled, this strips the debug nodes + from both the node defs and the function defs of the graph. Note that + this currently only strips the `Assert` nodes from the graph and + converts them into `NoOp`s instead. + +* `tf.data` + + * `tf.data` now has an `autotune_options.initial_parallelism` option to + control the initial parallelism setting used by autotune before the data + pipeline has started running. The default is 16. A lower value reduces + initial memory usage, while a higher value improves startup time. + ## Keras * `keras.layers.experimental.DynamicEmbedding` diff --git a/ci/official/README.md b/ci/official/README.md index d070af86cd8090..3c0181c5384392 100644 --- a/ci/official/README.md +++ b/ci/official/README.md @@ -45,7 +45,7 @@ cd tensorflow-git-dir # Here is a single-line example of running a script on Linux to build the # GPU version of TensorFlow for Python 3.12, using the public TF bazel cache and # a local build cache: -TFCI=py312,linux_x86_cuda,multicache ci/official/wheel.sh +TFCI=py312,linux_x86_cuda,public_cache,disk_cache ci/official/wheel.sh # First, set your TFCI variable to choose the environment settings. # TFCI is a comma-separated list of filenames from the envs directory, which @@ -57,9 +57,10 @@ TFCI=py312,linux_x86_cuda,multicache ci/official/wheel.sh # value in the "env_vars" list that you can choose to copy that environment. # Ex. 1: TFCI=py311,linux_x86_cuda,nightly_upload (nightly job) # Ex. 2: TFCI=py39,linux_x86,rbe (continuous job) -# Non-Googlers should replace "nightly_upload" or "rbe" with "multicache". -# Googlers should replace "nightly_upload" with "multicache" or "rbe", if -# you have set up your system to use RBE (see further below). +# Non-Googlers should replace "nightly_upload" or "rbe" with +# "public_cache,disk_cache". 
+# Googlers should replace "nightly_upload" with "public_cache,disk_cache" or +# "rbe", if you have set up your system to use RBE (see further below). # # Here is how to choose your TFCI value: # 1. A Python version must come first, because other scripts reference it. @@ -74,7 +75,9 @@ TFCI=py312,linux_x86_cuda,multicache ci/official/wheel.sh # Ex. linux_x86_cuda -- x86_64 Linux platform, with Nvidia CUDA support # Ex. macos_arm64 -- arm64 MacOS platform # 3. Add modifiers. Some modifiers for local execution are: -# Ex. multicache -- Use a local cache combined with TF's public cache +# Ex. disk_cache -- Use a local cache +# Ex. public_cache -- Use TF's public cache (read-only) +# Ex. public_cache_push -- Use TF's public cache (read and write, Googlers only) # Ex. rbe -- Use RBE for faster builds (Googlers only; see below) # Ex. no_docker -- Disable docker on enabled platforms # See full examples below for more details on these. Some other modifiers are: @@ -94,7 +97,7 @@ TFCI=py312,linux_x86_cuda,multicache ci/official/wheel.sh # or tests passing incorrectly. # - Automatic LLVM updates are known to extend build time even with # the cache; this is unavoidable. -export TFCI=py311,linux_x86,multicache +export TFCI=py311,linux_x86,public_cache,disk_cache # Recommended: Configure Docker. (Linux only) # @@ -127,7 +130,7 @@ export TFCI=py311,linux_x86,multicache # it is only available to a limited set of internal TensorFlow developers. # # RBE is incompatible with local caching, so you must remove -# ci/official/envs/local_multicache from your $TFCI file. +# disk_cache, public_cache, and public_cache_push from your $TFCI file. # # To use RBE, you must first run `gcloud auth application-default login`, then: export TFCI=py311,linux_x86,rbe diff --git a/ci/official/any.sh b/ci/official/any.sh index 980bb3cfdf403a..dc1484b64dc9ea 100755 --- a/ci/official/any.sh +++ b/ci/official/any.sh @@ -29,7 +29,7 @@ # ./any.sh # # 3. 
DO THE SAME WITH A LOCAL CACHE OR RBE: -# export TF_ANY_EXTRA_ENV=ci/official/envs/local_multicache +# export TF_ANY_EXTRA_ENV=ci/official/envs/public_cache,ci/official/envs/disk_cache # ... # ./any.sh # or @@ -39,8 +39,8 @@ set -euxo pipefail cd "$(dirname "$0")/../../" # tensorflow/ # Any request that includes "nightly_upload" should just use the -# local multi-cache instead. -export TFCI="$(echo $TFCI | sed 's/,nightly_upload/,multicache/')" +# local multi-cache (public read-only cache + disk cache) instead. +export TFCI="$(echo $TFCI | sed 's/,nightly_upload/,public_cache,disk_cache/')" if [[ -n "${TF_ANY_EXTRA_ENV:-}" ]]; then export TFCI="$TFCI,$TF_ANY_EXTRA_ENV" fi diff --git a/ci/official/bisect.sh b/ci/official/bisect.sh index 4076a73b867e7a..7f18dd1460ff5b 100755 --- a/ci/official/bisect.sh +++ b/ci/official/bisect.sh @@ -34,6 +34,6 @@ # export TF_ANY_MODE=test set -euxo pipefail cd "$(dirname "$0")/../../" # tensorflow/ -export TFCI="$(echo $TFCI | sed 's/,nightly_upload/,multicache/')" +export TFCI="$(echo $TFCI | sed 's/,nightly_upload/,public_cache,disk_cache/')" git bisect start "$TF_BISECT_BAD" "$TF_BISECT_GOOD" git bisect run $TF_BISECT_SCRIPT diff --git a/ci/official/code_check_full.sh b/ci/official/code_check_full.sh index 5b1370b4f31e06..448fb82bf288b9 100755 --- a/ci/official/code_check_full.sh +++ b/ci/official/code_check_full.sh @@ -15,4 +15,4 @@ # ============================================================================== source "${BASH_SOURCE%/*}/utilities/setup.sh" -tfrun bats ./ci/official/utilities/code_check_full.bats --timing --output "$TFCI_OUTPUT_DIR" +tfrun bats ./ci/official/utilities/code_check_full.bats --timing --output "$TFCI_OUTPUT_DIR" \ No newline at end of file diff --git a/ci/official/containers/linux_arm64/devel.packages.txt b/ci/official/containers/linux_arm64/devel.packages.txt index efbae80eefacee..a8a9cb442c8b0b 100644 --- a/ci/official/containers/linux_arm64/devel.packages.txt +++ 
b/ci/official/containers/linux_arm64/devel.packages.txt @@ -3,8 +3,6 @@ autoconf automake build-essential ca-certificates -# TODO(b/308399490) Remove CMake once dm-tree (Keras dependency) has 3.12 wheels -cmake llvm-17 clang-17 clang-format-12 diff --git a/ci/official/containers/linux_arm64/jax.requirements.txt b/ci/official/containers/linux_arm64/jax.requirements.txt index 878d229d0f237e..6211528986fdc0 100644 --- a/ci/official/containers/linux_arm64/jax.requirements.txt +++ b/ci/official/containers/linux_arm64/jax.requirements.txt @@ -24,4 +24,6 @@ scipy==1.9.2;python_version=="3.11" scipy==1.7.3;python_version<"3.11" ml_dtypes>=0.2.0 +# For using Python 3.11 with Bazel 6 (b/286090018) +lit ~= 17.0.2 diff --git a/ci/official/envs/ci_default b/ci/official/envs/ci_default index d080a4566efe16..96d87423392541 100644 --- a/ci/official/envs/ci_default +++ b/ci/official/envs/ci_default @@ -58,6 +58,7 @@ TFCI_MACOS_UPGRADE_PYENV_ENABLE= TFCI_NIGHTLY_UPDATE_VERSION_ENABLE= TFCI_NVIDIA_SMI_ENABLE= TFCI_OUTPUT_DIR= +TFCI_PYCPP_SWAP_TO_BUILD_ENABLE= TFCI_PYTHON_VERIFY_PIP_INSTALL_ARGS= TFCI_PYTHON_VERSION= TFCI_WHL_AUDIT_ENABLE= diff --git a/ci/official/envs/disk_cache b/ci/official/envs/disk_cache new file mode 100644 index 00000000000000..bd8ccfa0d95692 --- /dev/null +++ b/ci/official/envs/disk_cache @@ -0,0 +1,20 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +# Sourcing this enables local disk cache +if [[ $(uname -s) == "Darwin" ]]; then + echo "Please note that using disk cache on macOS is not recommended because the" + echo "cache can end up being pretty big and make the build process inefficient." +fi +TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --disk_cache=$TFCI_OUTPUT_DIR/cache" diff --git a/ci/official/envs/enable_pycpp_build b/ci/official/envs/enable_pycpp_build new file mode 100644 index 00000000000000..d7e0e5ea8065c3 --- /dev/null +++ b/ci/official/envs/enable_pycpp_build @@ -0,0 +1,20 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# +# Changes the behavior in pycpp.sh from "run all tests" to "verify that all +# tests can compile." Used in some CI jobs (macOS and Linux Arm64) where test +# execution is too expensive. +TFCI_PYCPP_SWAP_TO_BUILD_ENABLE=1 +TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --build_tests_only" \ No newline at end of file diff --git a/ci/official/envs/linux_arm64 b/ci/official/envs/linux_arm64 index 7c4270408dd68d..161b0e2e803822 100644 --- a/ci/official/envs/linux_arm64 +++ b/ci/official/envs/linux_arm64 @@ -19,7 +19,7 @@ TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_arm64 # despite lacking Nvidia CUDA support. 
TFCI_BUILD_PIP_PACKAGE_ARGS="--repo_env=WHEEL_NAME=tensorflow" TFCI_DOCKER_ENABLE=1 -TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-latest-multi-python +TFCI_DOCKER_IMAGE=gcr.io/tensorflow-sigs/build-arm64:tf-2-16-multi-python TFCI_DOCKER_PULL_ENABLE=1 TFCI_DOCKER_REBUILD_ARGS="--target=tf ci/official/containers/linux_arm64" TFCI_INDEX_HTML_ENABLE=1 diff --git a/ci/official/envs/linux_arm64_onednn b/ci/official/envs/linux_arm64_onednn new file mode 100644 index 00000000000000..0d4b7cbd03bbaa --- /dev/null +++ b/ci/official/envs/linux_arm64_onednn @@ -0,0 +1,16 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================== +source ci/official/envs/linux_arm64 +TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --test_env=TF_ENABLE_ONEDNN_OPTS=1" diff --git a/ci/official/envs/linux_x86 b/ci/official/envs/linux_x86 index 97fe9956f14ee1..2efc0fac466b00 100644 --- a/ci/official/envs/linux_x86 +++ b/ci/official/envs/linux_x86 @@ -16,7 +16,7 @@ TFCI_BAZEL_COMMON_ARGS="--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --conf TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=linux_cpu TFCI_BUILD_PIP_PACKAGE_ARGS="--repo_env=WHEEL_NAME=tensorflow_cpu" TFCI_DOCKER_ENABLE=1 -TFCI_DOCKER_IMAGE=tensorflow/build:latest-python${TFCI_PYTHON_VERSION} +TFCI_DOCKER_IMAGE=tensorflow/build:2.16-python${TFCI_PYTHON_VERSION} TFCI_DOCKER_PULL_ENABLE=1 TFCI_DOCKER_REBUILD_ARGS="--build-arg PYTHON_VERSION=python$TFCI_PYTHON_VERSION --target=devel tensorflow/tools/tf_sig_build_dockerfiles" TFCI_INDEX_HTML_ENABLE=1 diff --git a/ci/official/envs/macos_x86 b/ci/official/envs/macos_x86 index 3959830535628b..56166a0d0d4309 100644 --- a/ci/official/envs/macos_x86 +++ b/ci/official/envs/macos_x86 @@ -22,8 +22,8 @@ TFCI_MACOS_BAZEL_TEST_DIR_PATH="/System/Volumes/Data/bazel_output" TFCI_MACOS_INSTALL_BAZELISK_ENABLE=1 TFCI_MACOS_INSTALL_BAZELISK_URL="https://github.com/bazelbuild/bazelisk/releases/download/v1.11.0/bazelisk-darwin-amd64" TFCI_MACOS_TWINE_INSTALL_ENABLE=1 -TFCI_MACOS_UPGRADE_PYENV_ENABLE=1 TFCI_OUTPUT_DIR=build_output +TFCI_WHL_BAZEL_TEST_ENABLE=1 TFCI_WHL_SIZE_LIMIT=255M TFCI_WHL_SIZE_LIMIT_ENABLE=1 diff --git a/ci/official/envs/macos_x86_cross_compile b/ci/official/envs/macos_x86_cross_compile index 79f717156ea939..3a9dd2557faa1c 100644 --- a/ci/official/envs/macos_x86_cross_compile +++ b/ci/official/envs/macos_x86_cross_compile @@ -13,8 +13,7 @@ # limitations under the License. 
# ============================================================================== source ci/official/envs/macos_x86 -# Limit jobs to 100 to avoid running into "out of memory" issues (b/316266643) -TFCI_BAZEL_COMMON_ARGS="--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --jobs=100 --config cross_compile_macos_x86" +TFCI_BAZEL_COMMON_ARGS="--repo_env=TF_PYTHON_VERSION=$TFCI_PYTHON_VERSION --config cross_compile_macos_x86" TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX=cross_compile_macos_x86 TFCI_MACOS_CROSS_COMPILE_ENABLE=1 TFCI_MACOS_CROSS_COMPILE_SDK_DEST="tensorflow/tools/toolchains/cross_compile/cc/MacOSX.sdk" diff --git a/ci/official/envs/multicache b/ci/official/envs/public_cache similarity index 85% rename from ci/official/envs/multicache rename to ci/official/envs/public_cache index eb5c58e68e646f..ec57aad869ca47 100644 --- a/ci/official/envs/multicache +++ b/ci/official/envs/public_cache @@ -12,10 +12,10 @@ # See the License for the specific language governing permissions and # limitations under the License. # ============================================================================== -# Combine TF public build cache and local disk cache +# Sourcing this enables Bazel remote cache (public, read-only) # The cache configs are different for MacOS and Linux if [[ $(uname -s) == "Darwin" ]]; then - TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_macos_cache --disk_cache=$TFCI_OUTPUT_DIR/cache" + TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_macos_cache" else - TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_cache --disk_cache=$TFCI_OUTPUT_DIR/cache" + TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_cache" fi diff --git a/ci/official/envs/public_cache_push b/ci/official/envs/public_cache_push new file mode 100644 index 00000000000000..e686a0aac5d5ce --- /dev/null +++ b/ci/official/envs/public_cache_push @@ -0,0 +1,24 @@ +# Copyright 2023 The TensorFlow Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================== +# Sourcing this enables Bazel remote cache (read and write) +# Note that "_push" cache configs write to GCS buckets and require +# authentication. If you are not a Googler, source "public_cache" to enable the +# public read-only cache. +# The cache configs are different for MacOS and Linux +if [[ $(uname -s) == "Darwin" ]]; then + TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_macos_cache_push" +else + TFCI_BAZEL_COMMON_ARGS="$TFCI_BAZEL_COMMON_ARGS --config tf_public_cache_push" +fi diff --git a/ci/official/pycpp.sh b/ci/official/pycpp.sh index 34294fe8a107f6..cf346007949c1e 100755 --- a/ci/official/pycpp.sh +++ b/ci/official/pycpp.sh @@ -15,7 +15,11 @@ # ============================================================================== source "${BASH_SOURCE%/*}/utilities/setup.sh" -tfrun bazel test $TFCI_BAZEL_COMMON_ARGS --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test" +if [[ $TFCI_PYCPP_SWAP_TO_BUILD_ENABLE == 1 ]]; then + tfrun bazel build $TFCI_BAZEL_COMMON_ARGS --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test" +else + tfrun bazel test $TFCI_BAZEL_COMMON_ARGS --profile "$TFCI_OUTPUT_DIR/profile.json.gz" --config="${TFCI_BAZEL_TARGET_SELECTING_CONFIG_PREFIX}_pycpp_test" +fi # Note: the profile can 
be viewed by visiting chrome://tracing in a Chrome browser. # See https://docs.bazel.build/versions/main/skylark/performance.html#performance-profiling diff --git a/ci/official/requirements_updater/.bazelversion b/ci/official/requirements_updater/.bazelversion new file mode 100644 index 00000000000000..f22d756da39d4c --- /dev/null +++ b/ci/official/requirements_updater/.bazelversion @@ -0,0 +1 @@ +6.5.0 diff --git a/ci/official/requirements_updater/release_updater.sh b/ci/official/requirements_updater/release_updater.sh index 88d54666eb21db..3d47199c7187af 100644 --- a/ci/official/requirements_updater/release_updater.sh +++ b/ci/official/requirements_updater/release_updater.sh @@ -25,7 +25,10 @@ SUPPORTED_VERSIONS=("3_9" "3_10" "3_11" "3_12") for VERSION in "${SUPPORTED_VERSIONS[@]}" do cp ../../../requirements_lock_"$VERSION".txt "requirements_lock_"$VERSION".txt" - bazel run --experimental_convenience_symlinks=ignore //:requirements_"$VERSION"_release.update + bazel run \ + --experimental_convenience_symlinks=ignore \ + --enable_bzlmod=false \ + //:requirements_"$VERSION"_release.update sed -i '/^#/d' requirements_lock_"$VERSION".txt mv "requirements_lock_"$VERSION".txt" ../../../requirements_lock_"$VERSION".txt done diff --git a/ci/official/requirements_updater/requirements.in b/ci/official/requirements_updater/requirements.in index 46b5a532d5bb17..364134fcf7c39b 100644 --- a/ci/official/requirements_updater/requirements.in +++ b/ci/official/requirements_updater/requirements.in @@ -11,7 +11,7 @@ astor == 0.7.1 typing_extensions == 4.8.0 gast == 0.4.0 termcolor == 2.3.0 -wrapt == 1.14.1 +wrapt == 1.16.0 tblib == 2.0.0 # Install tensorboard, and keras @@ -19,7 +19,7 @@ tblib == 2.0.0 # Note that we must use nightly here as these are used in nightly jobs # For release jobs, we will pin these on the release branch keras-nightly ~= 3.0.0.dev -tb-nightly ~= 2.15.0.a +tb-nightly ~= 2.17.0.a # Test dependencies grpcio >= 1.24.3, < 2.0 diff --git 
a/ci/official/requirements_updater/updater.sh b/ci/official/requirements_updater/updater.sh index 898151dab1b599..95c67322966d11 100755 --- a/ci/official/requirements_updater/updater.sh +++ b/ci/official/requirements_updater/updater.sh @@ -28,7 +28,10 @@ SUPPORTED_VERSIONS=("3_9" "3_10" "3_11" "3_12") for VERSION in "${SUPPORTED_VERSIONS[@]}" do touch "requirements_lock_$VERSION.txt" - bazel run --experimental_convenience_symlinks=ignore //:requirements_"$VERSION".update + bazel run \ + --experimental_convenience_symlinks=ignore \ + --enable_bzlmod=false \ + //:requirements_"$VERSION".update sed -i '/^#/d' requirements_lock_"$VERSION".txt mv requirements_lock_"$VERSION".txt ../../../requirements_lock_"$VERSION".txt done diff --git a/ci/official/upload.sh b/ci/official/upload.sh index d3411cb2284876..28ad6a68409419 100755 --- a/ci/official/upload.sh +++ b/ci/official/upload.sh @@ -25,28 +25,36 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then fi source ci/official/utilities/get_versions.sh -# $TF_VER_FULL will resolve to e.g. "2.15.0-rc2". When given a directory path, -# gsutil copies the basename of the directory into the provided path. Uploading -# /tmp/$TF_VER_FULL to gs://bucket/ will create gs://bucket/$TF_VER_FULL/files. -# Since $TF_VER_FULL comes from get_versions.sh, which must be run *after* -# update_version.py, this can't be set inside the rest of the _upload envs. -DOWNLOADS="$(mktemp -d)/$TF_VER_FULL" +# Note on gsutil commands: +# "gsutil cp" always "copies into". It cannot act on the contents of a directory +# and it does not seem possible to e.g. copy "gs://foo/bar" as anything other than +# "/path/bar". This script uses "gsutil rsync" instead, which acts on directory +# contents. About arguments to gsutil: +# "gsutil -m rsync" runs in parallel. +# "gsutil rsync -r" is recursive and makes directories work. 
+# "gsutil rsync -d" is "sync and delete files from destination if not present in source" + +DOWNLOADS="$(mktemp -d)" mkdir -p "$DOWNLOADS" -# -r is needed to copy a whole folder. -gsutil -m cp -r "$TFCI_ARTIFACT_STAGING_GCS_URI" "$DOWNLOADS" +gsutil -m rsync -r "$TFCI_ARTIFACT_STAGING_GCS_URI" "$DOWNLOADS" ls "$DOWNLOADS" # Upload all build artifacts to e.g. gs://tensorflow/versions/2.16.0-rc1 (releases) or # gs://tensorflow/nightly/2.16.0-dev20240105 (nightly), overwriting previous values. if [[ "$TFCI_ARTIFACT_FINAL_GCS_ENABLE" == 1 ]]; then gcloud auth activate-service-account --key-file="$TFCI_ARTIFACT_FINAL_GCS_SA_PATH" - gsutil -m cp -r "$DOWNLOADS" "$TFCI_ARTIFACT_FINAL_GCS_URI" + + # $TF_VER_FULL will resolve to e.g. "2.15.0-rc2". Since $TF_VER_FULL comes + # from get_versions.sh, which must be run *after* update_version.py, FINAL_URI + # can't be set inside the rest of the _upload envs. + FINAL_URI="$TFCI_ARTIFACT_FINAL_GCS_URI/$TF_VER_FULL" + gsutil -m rsync -d -r "$DOWNLOADS" "$FINAL_URI" + # Also mirror the latest-uploaded folder to the "latest" directory. - # GCS does not support symlinks. -p preserves ACLs. -d deletes - # no-longer-present files (it's what makes this act as a mirror). - gsutil rsync -d -p -r "$TFCI_ARTIFACT_FINAL_GCS_URI" "$TFCI_ARTIFACT_LATEST_GCS_URI" + # GCS does not support symlinks. + gsutil -m rsync -d -r "$FINAL_URI" "$TFCI_ARTIFACT_LATEST_GCS_URI" fi if [[ "$TFCI_ARTIFACT_FINAL_PYPI_ENABLE" == 1 ]]; then - twine upload $TFCI_UPLOAD_WHL_PYPI_ARGS "$DOWNLOADS"/*.whl + twine upload $TFCI_ARTIFACT_FINAL_PYPI_ARGS "$DOWNLOADS"/*.whl fi diff --git a/ci/official/utilities/code_check_full.bats b/ci/official/utilities/code_check_full.bats index 78dd88f1d56be6..8dacee0875535d 100644 --- a/ci/official/utilities/code_check_full.bats +++ b/ci/official/utilities/code_check_full.bats @@ -306,6 +306,12 @@ EOF echo "Look at the instructions for ':api_compatibility_test -- --update_goldens=True'" } +# See b/279852433 (internal). 
+# TODO(b/279852433) Replace deps(//tensorflow/...) with deps(//...) +@test "Verify that it's possible to query every TensorFlow target without BUILD errors" { + bazel query "deps(//tensorflow/...)" > /dev/null +} + teardown_file() { bazel shutdown } diff --git a/ci/official/utilities/setup.sh b/ci/official/utilities/setup.sh index 5a03886c96eab9..10e847d0679391 100755 --- a/ci/official/utilities/setup.sh +++ b/ci/official/utilities/setup.sh @@ -88,18 +88,6 @@ if [[ "${OSTYPE}" =~ darwin* ]]; then source ./ci/official/utilities/setup_macos.sh fi -# Force-disable uploads if the job initiator is not louhi-bridge-server -# (the user that triggers all of the nightly and release jobs on Louhi) -# This is temporary: it's currently standard practice for employees to -# run nightly jobs for testing purposes. We're aiming to move away from -# this with more convenient methods, but as long as it's possible to do, -# we want to make sure those extra jobs don't upload anything. -# TODO(angerson) Remove this once artifact staging is done; after that, -# simply running a nightly again will not risk upload anything. -if [[ "${KOKORO_BUILD_INITIATOR:-}" != "louhi-bridge-server" ]]; then - source ./ci/official/envs/no_upload -fi - # Create and expand to the full path of TFCI_OUTPUT_DIR export TFCI_OUTPUT_DIR=$(realpath "$TFCI_OUTPUT_DIR") mkdir -p "$TFCI_OUTPUT_DIR" diff --git a/ci/official/utilities/setup_docker.sh b/ci/official/utilities/setup_docker.sh index c50ea618cfea6c..36afa2545eb244 100755 --- a/ci/official/utilities/setup_docker.sh +++ b/ci/official/utilities/setup_docker.sh @@ -14,8 +14,13 @@ # limitations under the License. # ============================================================================== if [[ "$TFCI_DOCKER_PULL_ENABLE" == 1 ]]; then + # Simple retry logic for docker-pull errors. Sleeps for 15s if a pull fails. + # Pulling an already-pulled container image will finish instantly, so + # repeating the command costs nothing. 
+ docker pull "$TFCI_DOCKER_IMAGE" || sleep 15 + docker pull "$TFCI_DOCKER_IMAGE" || sleep 15 docker pull "$TFCI_DOCKER_IMAGE" -fi +fi if [[ "$TFCI_DOCKER_REBUILD_ENABLE" == 1 ]]; then DOCKER_BUILDKIT=1 docker build --cache-from "$TFCI_DOCKER_IMAGE" -t "$TFCI_DOCKER_IMAGE" $TFCI_DOCKER_REBUILD_ARGS diff --git a/ci/official/utilities/setup_macos.sh b/ci/official/utilities/setup_macos.sh index f5b07d565437e8..8a63d318c6e18e 100644 --- a/ci/official/utilities/setup_macos.sh +++ b/ci/official/utilities/setup_macos.sh @@ -74,19 +74,13 @@ fi # it. TFCI Mac VMs only have one Python version installed so we need to install # the other versions manually. if [[ "${TFCI_MACOS_PYENV_INSTALL_ENABLE}" == 1 ]]; then - pyenv install "$TFCI_PYTHON_VERSION" + # Install the necessary Python, unless it's already present + pyenv install -s "$TFCI_PYTHON_VERSION" pyenv local "$TFCI_PYTHON_VERSION" # Do a sanity check to make sure that we using the correct Python version python --version fi -if [[ "$TFCI_PYTHON_VERSION" == "3.12" ]]; then - # dm-tree (Keras v3 dependency) doesn't have pre-built wheels for 3.12 yet. - # Having CMake allows building them. - # Once the wheels are added, this should be removed - b/308399490. - brew install cmake -fi - # TFCI Mac VM images do not have twine installed by default so we need to # install it manually. We use Twine in nightly builds to upload Python packages # to PyPI. diff --git a/ci/official/wheel.sh b/ci/official/wheel.sh index ac8c282389920f..9b8c0a6850d6e6 100755 --- a/ci/official/wheel.sh +++ b/ci/official/wheel.sh @@ -28,7 +28,7 @@ if [[ "$TFCI_NIGHTLY_UPDATE_VERSION_ENABLE" == 1 ]]; then fi tfrun bazel build $TFCI_BAZEL_COMMON_ARGS //tensorflow/tools/pip_package/v2:wheel $TFCI_BUILD_PIP_PACKAGE_ARGS -tfrun cp -a "./bazel-bin/tensorflow/tools/pip_package/v2/wheel_house/." 
"$TFCI_OUTPUT_DIR" +tfrun find ./bazel-bin/tensorflow/tools/pip_package -iname "*.whl" -exec cp {} $TFCI_OUTPUT_DIR \; tfrun ./ci/official/utilities/rename_and_verify_wheels.sh if [[ "$TFCI_ARTIFACT_STAGING_GCS_ENABLE" == 1 ]]; then diff --git a/requirements_lock_3_10.txt b/requirements_lock_3_10.txt index 434b38d603df80..2335d295d0faf6 100644 --- a/requirements_lock_3_10.txt +++ b/requirements_lock_3_10.txt @@ -1,6 +1,6 @@ -absl-py==2.0.0 \ - --hash=sha256:9a28abb62774ae4e8edbe2dd4c49ffcd45a6a848952a5eccc6a49f3f0fc1e2f3 \ - --hash=sha256:d9690211c5fcfefcdd1a45470ac2b5c5acd45241c3af71eed96bc5441746c0d5 +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff # via # keras-nightly # tb-nightly @@ -12,105 +12,101 @@ astunparse==1.6.3 \ --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 # via -r requirements.in -cachetools==5.3.2 \ - --hash=sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2 \ - --hash=sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1 - # via google-auth -certifi==2023.7.22 \ - --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ - --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 +certifi==2024.2.2 \ + --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ + --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 # via requests -charset-normalizer==3.3.1 \ - --hash=sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5 \ - --hash=sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93 \ - --hash=sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a \ - 
--hash=sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d \ - --hash=sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c \ - --hash=sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1 \ - --hash=sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58 \ - --hash=sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2 \ - --hash=sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557 \ - --hash=sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147 \ - --hash=sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041 \ - --hash=sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2 \ - --hash=sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2 \ - --hash=sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7 \ - --hash=sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296 \ - --hash=sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690 \ - --hash=sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67 \ - --hash=sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57 \ - --hash=sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597 \ - --hash=sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846 \ - --hash=sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b \ - --hash=sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97 \ - --hash=sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c \ - --hash=sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62 \ - --hash=sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa \ - --hash=sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f \ - --hash=sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e \ - 
--hash=sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821 \ - --hash=sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3 \ - --hash=sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4 \ - --hash=sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb \ - --hash=sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727 \ - --hash=sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514 \ - --hash=sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d \ - --hash=sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761 \ - --hash=sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55 \ - --hash=sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f \ - --hash=sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c \ - --hash=sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034 \ - --hash=sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6 \ - --hash=sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae \ - --hash=sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1 \ - --hash=sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14 \ - --hash=sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1 \ - --hash=sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228 \ - --hash=sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708 \ - --hash=sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48 \ - --hash=sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f \ - --hash=sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5 \ - --hash=sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f \ - --hash=sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4 \ - 
--hash=sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8 \ - --hash=sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff \ - --hash=sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61 \ - --hash=sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b \ - --hash=sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97 \ - --hash=sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b \ - --hash=sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605 \ - --hash=sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728 \ - --hash=sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d \ - --hash=sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c \ - --hash=sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf \ - --hash=sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673 \ - --hash=sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1 \ - --hash=sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b \ - --hash=sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41 \ - --hash=sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8 \ - --hash=sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f \ - --hash=sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4 \ - --hash=sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008 \ - --hash=sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9 \ - --hash=sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5 \ - --hash=sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f \ - --hash=sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e \ - --hash=sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273 \ - 
--hash=sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45 \ - --hash=sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e \ - --hash=sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656 \ - --hash=sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e \ - --hash=sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c \ - --hash=sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2 \ - --hash=sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72 \ - --hash=sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056 \ - --hash=sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397 \ - --hash=sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42 \ - --hash=sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd \ - --hash=sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3 \ - --hash=sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213 \ - --hash=sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf \ - --hash=sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67 +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + 
--hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + 
--hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + 
--hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + 
--hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests dill==0.3.7 \ --hash=sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e \ @@ -133,6 +129,7 @@ dm-tree==0.1.8 \ --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ @@ -140,6 +137,8 @@ dm-tree==0.1.8 \ --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ 
--hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ @@ -147,6 +146,7 @@ dm-tree==0.1.8 \ --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ @@ -154,6 +154,7 @@ dm-tree==0.1.8 \ --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via keras-nightly @@ -161,71 +162,61 @@ gast==0.4.0 \ --hash=sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1 \ --hash=sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4 # via -r requirements.in -google-auth==2.23.3 \ - --hash=sha256:6864247895eea5d13b9c57c9e03abb49cb94ce2dc7c58e91cba3248c7477c9e3 \ - --hash=sha256:a8f4608e65c244ead9e0538f181a96c6e11199ec114d41f1d7b1bffa96937bda - # via - # google-auth-oauthlib - # tb-nightly -google-auth-oauthlib==1.1.0 \ - --hash=sha256:089c6e587d36f4803ac7e0720c045c6a8b1fd1790088b8424975b90d0ee61c12 \ - 
--hash=sha256:83ea8c3b0881e453790baff4448e8a6112ac8778d1de9da0b68010b843937afb - # via tb-nightly -grpcio==1.59.2 \ - --hash=sha256:023088764012411affe7db183d1ada3ad9daf2e23ddc719ff46d7061de661340 \ - --hash=sha256:08d77e682f2bf730a4961eea330e56d2f423c6a9b91ca222e5b1eb24a357b19f \ - --hash=sha256:0a4a3833c0e067f3558538727235cd8a49709bff1003200bbdefa2f09334e4b1 \ - --hash=sha256:0a754aff9e3af63bdc4c75c234b86b9d14e14a28a30c4e324aed1a9b873d755f \ - --hash=sha256:11168ef43e4a43ff1b1a65859f3e0ef1a173e277349e7fb16923ff108160a8cd \ - --hash=sha256:128e20f57c5f27cb0157e73756d1586b83c1b513ebecc83ea0ac37e4b0e4e758 \ - --hash=sha256:1f9524d1d701e399462d2c90ba7c193e49d1711cf429c0d3d97c966856e03d00 \ - --hash=sha256:1ff16d68bf453275466a9a46739061a63584d92f18a0f5b33d19fc97eb69867c \ - --hash=sha256:2067274c88bc6de89c278a672a652b4247d088811ece781a4858b09bdf8448e3 \ - --hash=sha256:2171c39f355ba5b551c5d5928d65aa6c69807fae195b86ef4a7d125bcdb860a9 \ - --hash=sha256:242adc47725b9a499ee77c6a2e36688fa6c96484611f33b1be4c57ab075a92dd \ - --hash=sha256:27f879ae604a7fcf371e59fba6f3ff4635a4c2a64768bd83ff0cac503142fef4 \ - --hash=sha256:2b230028a008ae1d0f430acb227d323ff8a619017415cf334c38b457f814119f \ - --hash=sha256:3059668df17627f0e0fa680e9ef8c995c946c792612e9518f5cc1503be14e90b \ - --hash=sha256:31176aa88f36020055ace9adff2405a33c8bdbfa72a9c4980e25d91b2f196873 \ - --hash=sha256:36f53c2b3449c015880e7d55a89c992c357f176327b0d2873cdaaf9628a37c69 \ - --hash=sha256:3b4368b33908f683a363f376dfb747d40af3463a6e5044afee07cf9436addf96 \ - --hash=sha256:3c61d641d4f409c5ae46bfdd89ea42ce5ea233dcf69e74ce9ba32b503c727e29 \ - --hash=sha256:4abb717e320e74959517dc8e84a9f48fbe90e9abe19c248541e9418b1ce60acd \ - --hash=sha256:4c93f4abbb54321ee6471e04a00139c80c754eda51064187963ddf98f5cf36a4 \ - --hash=sha256:535561990e075fa6bd4b16c4c3c1096b9581b7bb35d96fac4650f1181e428268 \ - --hash=sha256:53c9aa5ddd6857c0a1cd0287225a2a25873a8e09727c2e95c4aebb1be83a766a \ - 
--hash=sha256:5d573e70a6fe77555fb6143c12d3a7d3fa306632a3034b4e7c59ca09721546f8 \ - --hash=sha256:6009386a2df66159f64ac9f20425ae25229b29b9dd0e1d3dd60043f037e2ad7e \ - --hash=sha256:686e975a5d16602dc0982c7c703948d17184bd1397e16c8ee03511ecb8c4cdda \ - --hash=sha256:6959fb07e8351e20501ffb8cc4074c39a0b7ef123e1c850a7f8f3afdc3a3da01 \ - --hash=sha256:6b25ed37c27e652db01be341af93fbcea03d296c024d8a0e680017a268eb85dd \ - --hash=sha256:6da6dea3a1bacf99b3c2187e296db9a83029ed9c38fd4c52b7c9b7326d13c828 \ - --hash=sha256:72ca2399097c0b758198f2ff30f7178d680de8a5cfcf3d9b73a63cf87455532e \ - --hash=sha256:73abb8584b0cf74d37f5ef61c10722adc7275502ab71789a8fe3cb7ef04cf6e2 \ - --hash=sha256:74100fecaec8a535e380cf5f2fb556ff84957d481c13e54051c52e5baac70541 \ - --hash=sha256:75c6ecb70e809cf1504465174343113f51f24bc61e22a80ae1c859f3f7034c6d \ - --hash=sha256:7cf05053242f61ba94014dd3a986e11a083400a32664058f80bf4cf817c0b3a1 \ - --hash=sha256:9411e24328a2302e279e70cae6e479f1fddde79629fcb14e03e6d94b3956eabf \ - --hash=sha256:a213acfbf186b9f35803b52e4ca9addb153fc0b67f82a48f961be7000ecf6721 \ - --hash=sha256:bb7e0fe6ad73b7f06d7e2b689c19a71cf5cc48f0c2bf8608469e51ffe0bd2867 \ - --hash=sha256:c2504eed520958a5b77cc99458297cb7906308cb92327f35fb7fbbad4e9b2188 \ - --hash=sha256:c35aa9657f5d5116d23b934568e0956bd50c615127810fffe3ac356a914c176a \ - --hash=sha256:c5f09cffa619adfb44799fa4a81c2a1ad77c887187613fb0a8f201ab38d89ba1 \ - --hash=sha256:c978f864b35f2261e0819f5cd88b9830b04dc51bcf055aac3c601e525a10d2ba \ - --hash=sha256:cbe946b3e6e60a7b4618f091e62a029cb082b109a9d6b53962dd305087c6e4fd \ - --hash=sha256:cc3e4cd087f07758b16bef8f31d88dbb1b5da5671d2f03685ab52dece3d7a16e \ - --hash=sha256:cf0dead5a2c5a3347af2cfec7131d4f2a2e03c934af28989c9078f8241a491fa \ - --hash=sha256:d2794f0e68b3085d99b4f6ff9c089f6fdd02b32b9d3efdfbb55beac1bf22d516 \ - --hash=sha256:d2fa68a96a30dd240be80bbad838a0ac81a61770611ff7952b889485970c4c71 \ - --hash=sha256:d6f70406695e3220f09cd7a2f879333279d91aa4a8a1d34303b56d61a8180137 \ - 
--hash=sha256:d8f9cd4ad1be90b0cf350a2f04a38a36e44a026cac1e036ac593dc48efe91d52 \ - --hash=sha256:da2d94c15f88cd40d7e67f7919d4f60110d2b9d5b1e08cf354c2be773ab13479 \ - --hash=sha256:e1727c1c0e394096bb9af185c6923e8ea55a5095b8af44f06903bcc0e06800a2 \ - --hash=sha256:e420ced29b5904cdf9ee5545e23f9406189d8acb6750916c2db4793dada065c6 \ - --hash=sha256:e82c5cf1495244adf5252f925ac5932e5fd288b3e5ab6b70bec5593074b7236c \ - --hash=sha256:f1ef0d39bc1feb420caf549b3c657c871cad4ebbcf0580c4d03816b0590de0cf \ - --hash=sha256:f8753a6c88d1d0ba64302309eecf20f70d2770f65ca02d83c2452279085bfcd3 \ - --hash=sha256:f93dbf58f03146164048be5426ffde298b237a5e059144847e4940f5b80172c3 +grpcio==1.60.1 \ + --hash=sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8 \ + --hash=sha256:069fe2aeee02dfd2135d562d0663fe70fbb69d5eed6eb3389042a7e963b54de8 \ + --hash=sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104 \ + --hash=sha256:0c5807e9152eff15f1d48f6b9ad3749196f79a4a050469d99eecb679be592acc \ + --hash=sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092 \ + --hash=sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1 \ + --hash=sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180 \ + --hash=sha256:2f44c32aef186bbba254129cea1df08a20be414144ac3bdf0e84b24e3f3b2e05 \ + --hash=sha256:30e980cd6db1088c144b92fe376747328d5554bc7960ce583ec7b7d81cd47287 \ + --hash=sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216 \ + --hash=sha256:33bdea30dcfd4f87b045d404388469eb48a48c33a6195a043d116ed1b9a0196c \ + --hash=sha256:39aa848794b887120b1d35b1b994e445cc028ff602ef267f87c38122c1add50d \ + --hash=sha256:4216e67ad9a4769117433814956031cb300f85edc855252a645a9a724b3b6594 \ + --hash=sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c \ + --hash=sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87 \ + 
--hash=sha256:50d56280b482875d1f9128ce596e59031a226a8b84bec88cb2bf76c289f5d0de \ + --hash=sha256:53b69e79d00f78c81eecfb38f4516080dc7f36a198b6b37b928f1c13b3c063e9 \ + --hash=sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2 \ + --hash=sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff \ + --hash=sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904 \ + --hash=sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73 \ + --hash=sha256:6d140bdeb26cad8b93c1455fa00573c05592793c32053d6e0016ce05ba267549 \ + --hash=sha256:6e490fa5f7f5326222cb9f0b78f207a2b218a14edf39602e083d5f617354306f \ + --hash=sha256:6ecf21d20d02d1733e9c820fb5c114c749d888704a7ec824b545c12e78734d1c \ + --hash=sha256:70c83bb530572917be20c21f3b6be92cd86b9aecb44b0c18b1d3b2cc3ae47df0 \ + --hash=sha256:72153a0d2e425f45b884540a61c6639436ddafa1829a42056aa5764b84108b8e \ + --hash=sha256:73e14acd3d4247169955fae8fb103a2b900cfad21d0c35f0dcd0fdd54cd60367 \ + --hash=sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc \ + --hash=sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303 \ + --hash=sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb \ + --hash=sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c \ + --hash=sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804 \ + --hash=sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03 \ + --hash=sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6 \ + --hash=sha256:9b106bc52e7f28170e624ba61cc7dc6829566e535a6ec68528f8e1afbed1c41f \ + --hash=sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0 \ + --hash=sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd \ + --hash=sha256:a212e5dea1a4182e40cd3e4067ee46be9d10418092ce3627475e995cca95de21 \ + --hash=sha256:a731ac5cffc34dac62053e0da90f0c0b8560396a19f69d9703e88240c8f05858 \ + 
--hash=sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce \ + --hash=sha256:b58b855d0071575ea9c7bc0d84a06d2edfbfccec52e9657864386381a7ce1ae9 \ + --hash=sha256:bc808924470643b82b14fe121923c30ec211d8c693e747eba8a7414bc4351a23 \ + --hash=sha256:c557e94e91a983e5b1e9c60076a8fd79fea1e7e06848eb2e48d0ccfb30f6e073 \ + --hash=sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2 \ + --hash=sha256:c8754c75f55781515a3005063d9a05878b2cfb3cb7e41d5401ad0cf19de14872 \ + --hash=sha256:cb0af13433dbbd1c806e671d81ec75bd324af6ef75171fd7815ca3074fe32bfe \ + --hash=sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7 \ + --hash=sha256:cf77f8cf2a651fbd869fbdcb4a1931464189cd210abc4cfad357f1cacc8642a6 \ + --hash=sha256:d7404cebcdb11bb5bd40bf94131faf7e9a7c10a6c60358580fe83913f360f929 \ + --hash=sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962 \ + --hash=sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a \ + --hash=sha256:f1c3dc536b3ee124e8b24feb7533e5c70b9f2ef833e3b2e5513b2897fd46763a \ + --hash=sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177 \ + --hash=sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525 # via # -r requirements.in # tb-nightly @@ -258,113 +249,115 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.6 \ + --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ + --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f # via requests jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 # via -r requirements.in -keras-nightly==3.0.0.dev2023103103 \ - --hash=sha256:25a6a9030d7e067d535ec41ae6700636c90cabada80fbddb3bb8775006807860 \ - 
--hash=sha256:c39fccebb3d4cc9d838371981c4d3eef08fc06eadf6cffb39b356cb625bab50f +keras-nightly==3.0.4.dev2024021403 \ + --hash=sha256:24ce69d29d582771685bf4235f59663723405b5a5b16f3eaff2657e52e74663a \ + --hash=sha256:9f416e66b820ef833779d219d255b346b8b90a72fdbd0b2f1e90a43ad142a03d # via -r requirements.in -lit==17.0.4 \ - --hash=sha256:ee2e180128e770abc6aed3a02de2daf09d81b7d30225e315205d3599c311d304 +lit==17.0.6 \ + --hash=sha256:dfa9af9b55fc4509a56be7bf2346f079d7f4a242d583b9f2e0b078fd0abae31b # via -r requirements.in -markdown==3.5 \ - --hash=sha256:4afb124395ce5fc34e6d9886dab977fd9ae987fc6e85689f08278cf0c69d4bf3 \ - --hash=sha256:a807eb2e4778d9156c8f07876c6e4d50b5494c5665c4834f67b06459dfd877b3 +markdown==3.5.2 \ + --hash=sha256:d43323865d89fc0cb9b20c75fc8ad313af307cc087e84b657d9eec768eddeadd \ + --hash=sha256:e1ac7b3dc550ee80e602e71c1d168002f062e49f1b11e26a36264dafd4df2ef8 # via tb-nightly markdown-it-py==3.0.0 \ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb # via rich -markupsafe==2.1.3 \ - --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ - --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ - --hash=sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431 \ - --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ - --hash=sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c \ - --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ - --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ - --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ - --hash=sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939 \ - --hash=sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c \ - 
--hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ - --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ - --hash=sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9 \ - --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ - --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ - --hash=sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d \ - --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ - --hash=sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3 \ - --hash=sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00 \ - --hash=sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155 \ - --hash=sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac \ - --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ - --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ - --hash=sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8 \ - --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ - --hash=sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007 \ - --hash=sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24 \ - --hash=sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea \ - --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ - --hash=sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0 \ - --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ - --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ - --hash=sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2 \ - --hash=sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1 \ - 
--hash=sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707 \ - --hash=sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6 \ - --hash=sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c \ - --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ - --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ - --hash=sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779 \ - --hash=sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636 \ - --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ - --hash=sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad \ - --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ - --hash=sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc \ - --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ - --hash=sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48 \ - --hash=sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7 \ - --hash=sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e \ - --hash=sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b \ - --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ - --hash=sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5 \ - --hash=sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e \ - --hash=sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb \ - --hash=sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9 \ - --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ - --hash=sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc \ - --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ - 
--hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ - --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 +markupsafe==2.1.5 \ + --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ + --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ + --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ + --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ + --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ + --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ + --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ + --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ + --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \ + --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ + --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ + --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ + --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ + --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ + --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ + --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ + --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ + --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ + --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ + --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ + --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ + 
--hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ + --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ + --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ + --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ + --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ + --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ + --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ + --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ + --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ + --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ + --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ + --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ + --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ + --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ + --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ + --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ + --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ + --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ + --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ + --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ + --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ + --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ + --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ + --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ + 
--hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ + --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ + --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ + --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ + --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ + --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ + --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ + --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ + --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ + --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ + --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ + --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ + --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ + --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ + --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 # via werkzeug mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via markdown-it-py -ml-dtypes==0.3.1 \ - --hash=sha256:3d8ca0acbd377082792d8b97081ba580abdad67c6afb7f827012c675b052f058 \ - --hash=sha256:42a8980afd8b7c8e270e8b5c260237286b5b26acd276fcb758d13cd7cb567e99 \ - --hash=sha256:438437e2e614a3c91d75581653b6c40ec890e8b5994d7190a90c931740151c95 \ - --hash=sha256:4828b62fa3bf1ae35faa40f3db9a38ec72fbce02f328a1d14c3a9da4606af364 \ - --hash=sha256:4d94b2d1bed77284694f7fd0479640fa7aa5d96433dca3cbcec407a5ef752e77 \ - --hash=sha256:510d249a91face47211762eb294d6fe64f325356b965fb6388c1bf51bd339267 \ - 
--hash=sha256:5727effa7650f7ab10906542d137cfb3244fdc3b2b519beff42f82def8ba59be \ - --hash=sha256:5e0b0b6bb07fa5ad11bb61d174667176bee5e05857225067aabfc5adc1b51d23 \ - --hash=sha256:60778f99194b4c4f36ba42da200b35ef851ce4d4af698aaf70f5b91fe70fc611 \ - --hash=sha256:70984b473db6489ec1d8c79b082a1322105155193049d08a3b0c515094e9777b \ - --hash=sha256:979d7d196d9a17e0135ae22878f74241fbd3522cef58d7b292f1fd5b32282201 \ - --hash=sha256:a777928dcba8865ab4a8157eeb25d23aed7bc82e5fd74e1d5eca821d3f148b39 \ - --hash=sha256:cb0c404e0dd3e25b56362c1c1e5de0ef717f727dde59fa721af4ce6ab2acca44 \ - --hash=sha256:d1a8dc3bac1da2a17d0e2e4cba36ee89721d0bd33ea4765af2eefb5f41409e0f \ - --hash=sha256:da274599e4950a9b488d21571061f49a185537cc77f2d3f8121151d58a9e9f16 \ - --hash=sha256:f83ff080df8910c0f987f615b03e4f8198638e0c00c6e679ea8892dda909763b \ - --hash=sha256:fcae2c69715410d96906e1dfe8f017d9f78a0d10e0df91aae52e91f51fdfe45e - # via jax +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + --hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + 
--hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + --hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via + # jax + # keras-nightly namex==0.0.7 \ --hash=sha256:84ba65bc4d22bd909e3d26bf2ffb4b9529b608cb3f9a4336f776b04204ced69b \ --hash=sha256:8a4f062945f405d77cb66b907f16aa2fd83681945e998be840eb6c4154d40108 @@ -407,10 +400,6 @@ numpy==1.23.5 ; python_version <= "3.11" \ # opt-einsum # scipy # tb-nightly -oauthlib==3.2.2 \ - --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ - --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 - # via requests-oauthlib opt-einsum==3.3.0 \ --hash=sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147 \ --hash=sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549 @@ -440,57 +429,36 @@ protobuf==4.23.4 \ --hash=sha256:effeac51ab79332d44fba74660d40ae79985901ac21bca408f8dc335a81aa597 \ --hash=sha256:fee88269a090ada09ca63551bf2f573eb2424035bcf2cb1b121895b01a46594a # via tb-nightly -psutil==5.9.6 \ - --hash=sha256:10e8c17b4f898d64b121149afb136c53ea8b68c7531155147867b7b1ac9e7e28 \ - --hash=sha256:18cd22c5db486f33998f37e2bb054cc62fd06646995285e02a51b1e08da97017 \ - --hash=sha256:3ebf2158c16cc69db777e3c7decb3c0f43a7af94a60d72e87b2823aebac3d602 \ - --hash=sha256:51dc3d54607c73148f63732c727856f5febec1c7c336f8f41fcbd6315cce76ac \ - --hash=sha256:6e5fb8dc711a514da83098bc5234264e551ad980cec5f85dabf4d38ed6f15e9a \ - --hash=sha256:70cb3beb98bc3fd5ac9ac617a327af7e7f826373ee64c80efd4eb2856e5051e9 \ - --hash=sha256:748c9dd2583ed86347ed65d0035f45fa8c851e8d90354c122ab72319b5f366f4 \ - 
--hash=sha256:91ecd2d9c00db9817a4b4192107cf6954addb5d9d67a969a4f436dbc9200f88c \ - --hash=sha256:92e0cc43c524834af53e9d3369245e6cc3b130e78e26100d1f63cdb0abeb3d3c \ - --hash=sha256:a6f01f03bf1843280f4ad16f4bde26b817847b4c1a0db59bf6419807bc5ce05c \ - --hash=sha256:c69596f9fc2f8acd574a12d5f8b7b1ba3765a641ea5d60fb4736bf3c08a8214a \ - --hash=sha256:ca2780f5e038379e520281e4c032dddd086906ddff9ef0d1b9dcf00710e5071c \ - --hash=sha256:daecbcbd29b289aac14ece28eca6a3e60aa361754cf6da3dfb20d4d32b6c7f57 \ - --hash=sha256:e4b92ddcd7dd4cdd3f900180ea1e104932c7bce234fb88976e2a3b296441225a \ - --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ - --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa +psutil==5.9.8 \ + --hash=sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d \ + --hash=sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73 \ + --hash=sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8 \ + --hash=sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2 \ + --hash=sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e \ + --hash=sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36 \ + --hash=sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7 \ + --hash=sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c \ + --hash=sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee \ + --hash=sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421 \ + --hash=sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf \ + --hash=sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81 \ + --hash=sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0 \ + --hash=sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631 \ + 
--hash=sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4 \ + --hash=sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8 # via portpicker -pyasn1==0.5.0 \ - --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ - --hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.3.0 \ - --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ - --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d - # via google-auth -pygments==2.16.1 \ - --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \ - --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29 +pygments==2.17.2 \ + --hash=sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c \ + --hash=sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367 # via rich requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 - # via - # -r requirements.in - # requests-oauthlib - # tb-nightly -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a - # via google-auth-oauthlib -rich==13.6.0 \ - --hash=sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245 \ - --hash=sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef + # via -r requirements.in +rich==13.7.0 \ + --hash=sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa \ + --hash=sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235 # via keras-nightly -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - 
--hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth scipy==1.11.3 \ --hash=sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3 \ --hash=sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929 \ @@ -526,8 +494,8 @@ six==1.16.0 \ # via # astunparse # tb-nightly -tb-nightly==2.15.0a20231023 \ - --hash=sha256:8990a52985e3296aa18a6825efc017bcded9e2fb2cbcdd2d01f5c62d4fdc9825 +tb-nightly==2.17.0a20240214 \ + --hash=sha256:dbd59bcfd9b028e6199050e36cbb4ee1db4a94e69a7f8c00865f1498112a2b83 # via -r requirements.in tblib==2.0.0 \ --hash=sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a \ @@ -542,13 +510,17 @@ termcolor==2.3.0 \ --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ --hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a # via -r requirements.in +tf-keras-nightly==2.16.0.dev2024021410 \ + --hash=sha256:05a4c19c6a795ec9ed6f4dac69d443601b9206b910524d8b19bad6e362d49f47 \ + --hash=sha256:83d5a1dd3979b42164a2fac4bdd2ed5981ef0c2a0514ca64e4bc67e369caef5c + # via tb-nightly typing-extensions==4.8.0 \ --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef # via -r requirements.in -urllib3==2.0.7 \ - --hash=sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84 \ - --hash=sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e +urllib3==2.2.0 \ + --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \ + --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224 # via requests werkzeug==3.0.1 \ --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \ @@ -560,81 +532,77 @@ wheel==0.41.3 \ # via # -r requirements.in # astunparse -wrapt==1.14.1 \ - 
--hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ - --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ - --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ - --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ - --hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ - --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ - --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ - --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ - --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ - --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ - --hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ - --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ - --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ - --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ - --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ - --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ - --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ - --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ - --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ - --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ - --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ - --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ - --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ - --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ - 
--hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ - --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ - --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ - --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ - --hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ - --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ - --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ - --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ - --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ - --hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ - --hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ - --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ - --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ - --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ - --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ - --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ - --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ - --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ - --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ - --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ - --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ - --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ - --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ - --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ - 
--hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ - --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ - --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ - --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ - --hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ - --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ - --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ - --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ - --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ - --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ - --hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ - --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ - --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ - --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ - --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ - --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ - --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ - --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ - --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ - --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ - --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ - --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ - --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ - --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ - 
--hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ - --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + 
--hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + 
--hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + 
--hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 # via -r requirements.in setuptools==68.2.2 \ diff --git a/requirements_lock_3_11.txt b/requirements_lock_3_11.txt index 434b38d603df80..2335d295d0faf6 100644 --- a/requirements_lock_3_11.txt +++ b/requirements_lock_3_11.txt @@ -1,6 +1,6 @@ -absl-py==2.0.0 \ - --hash=sha256:9a28abb62774ae4e8edbe2dd4c49ffcd45a6a848952a5eccc6a49f3f0fc1e2f3 \ - --hash=sha256:d9690211c5fcfefcdd1a45470ac2b5c5acd45241c3af71eed96bc5441746c0d5 +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff # via # keras-nightly # tb-nightly @@ -12,105 +12,101 @@ astunparse==1.6.3 \ --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 # via -r requirements.in -cachetools==5.3.2 \ - --hash=sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2 \ - --hash=sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1 - # via google-auth -certifi==2023.7.22 \ - --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ - --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 +certifi==2024.2.2 \ + --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ + --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 # via requests -charset-normalizer==3.3.1 \ - --hash=sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5 \ - --hash=sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93 \ - --hash=sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a \ - --hash=sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d \ - --hash=sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c \ - 
--hash=sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1 \ - --hash=sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58 \ - --hash=sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2 \ - --hash=sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557 \ - --hash=sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147 \ - --hash=sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041 \ - --hash=sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2 \ - --hash=sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2 \ - --hash=sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7 \ - --hash=sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296 \ - --hash=sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690 \ - --hash=sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67 \ - --hash=sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57 \ - --hash=sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597 \ - --hash=sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846 \ - --hash=sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b \ - --hash=sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97 \ - --hash=sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c \ - --hash=sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62 \ - --hash=sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa \ - --hash=sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f \ - --hash=sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e \ - --hash=sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821 \ - --hash=sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3 \ - 
--hash=sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4 \ - --hash=sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb \ - --hash=sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727 \ - --hash=sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514 \ - --hash=sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d \ - --hash=sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761 \ - --hash=sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55 \ - --hash=sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f \ - --hash=sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c \ - --hash=sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034 \ - --hash=sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6 \ - --hash=sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae \ - --hash=sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1 \ - --hash=sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14 \ - --hash=sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1 \ - --hash=sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228 \ - --hash=sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708 \ - --hash=sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48 \ - --hash=sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f \ - --hash=sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5 \ - --hash=sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f \ - --hash=sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4 \ - --hash=sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8 \ - --hash=sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff \ - 
--hash=sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61 \ - --hash=sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b \ - --hash=sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97 \ - --hash=sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b \ - --hash=sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605 \ - --hash=sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728 \ - --hash=sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d \ - --hash=sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c \ - --hash=sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf \ - --hash=sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673 \ - --hash=sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1 \ - --hash=sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b \ - --hash=sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41 \ - --hash=sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8 \ - --hash=sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f \ - --hash=sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4 \ - --hash=sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008 \ - --hash=sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9 \ - --hash=sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5 \ - --hash=sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f \ - --hash=sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e \ - --hash=sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273 \ - --hash=sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45 \ - --hash=sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e \ - 
--hash=sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656 \ - --hash=sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e \ - --hash=sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c \ - --hash=sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2 \ - --hash=sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72 \ - --hash=sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056 \ - --hash=sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397 \ - --hash=sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42 \ - --hash=sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd \ - --hash=sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3 \ - --hash=sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213 \ - --hash=sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf \ - --hash=sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67 +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + 
--hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + 
--hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + 
--hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + 
--hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests dill==0.3.7 \ --hash=sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e \ @@ -133,6 +129,7 @@ dm-tree==0.1.8 \ --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ @@ -140,6 +137,8 @@ dm-tree==0.1.8 \ --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ 
--hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ @@ -147,6 +146,7 @@ dm-tree==0.1.8 \ --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ @@ -154,6 +154,7 @@ dm-tree==0.1.8 \ --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via keras-nightly @@ -161,71 +162,61 @@ gast==0.4.0 \ --hash=sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1 \ --hash=sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4 # via -r requirements.in -google-auth==2.23.3 \ - --hash=sha256:6864247895eea5d13b9c57c9e03abb49cb94ce2dc7c58e91cba3248c7477c9e3 \ - --hash=sha256:a8f4608e65c244ead9e0538f181a96c6e11199ec114d41f1d7b1bffa96937bda - # via - # google-auth-oauthlib - # tb-nightly -google-auth-oauthlib==1.1.0 \ - --hash=sha256:089c6e587d36f4803ac7e0720c045c6a8b1fd1790088b8424975b90d0ee61c12 \ - --hash=sha256:83ea8c3b0881e453790baff4448e8a6112ac8778d1de9da0b68010b843937afb - # via tb-nightly -grpcio==1.59.2 \ - --hash=sha256:023088764012411affe7db183d1ada3ad9daf2e23ddc719ff46d7061de661340 
\ - --hash=sha256:08d77e682f2bf730a4961eea330e56d2f423c6a9b91ca222e5b1eb24a357b19f \ - --hash=sha256:0a4a3833c0e067f3558538727235cd8a49709bff1003200bbdefa2f09334e4b1 \ - --hash=sha256:0a754aff9e3af63bdc4c75c234b86b9d14e14a28a30c4e324aed1a9b873d755f \ - --hash=sha256:11168ef43e4a43ff1b1a65859f3e0ef1a173e277349e7fb16923ff108160a8cd \ - --hash=sha256:128e20f57c5f27cb0157e73756d1586b83c1b513ebecc83ea0ac37e4b0e4e758 \ - --hash=sha256:1f9524d1d701e399462d2c90ba7c193e49d1711cf429c0d3d97c966856e03d00 \ - --hash=sha256:1ff16d68bf453275466a9a46739061a63584d92f18a0f5b33d19fc97eb69867c \ - --hash=sha256:2067274c88bc6de89c278a672a652b4247d088811ece781a4858b09bdf8448e3 \ - --hash=sha256:2171c39f355ba5b551c5d5928d65aa6c69807fae195b86ef4a7d125bcdb860a9 \ - --hash=sha256:242adc47725b9a499ee77c6a2e36688fa6c96484611f33b1be4c57ab075a92dd \ - --hash=sha256:27f879ae604a7fcf371e59fba6f3ff4635a4c2a64768bd83ff0cac503142fef4 \ - --hash=sha256:2b230028a008ae1d0f430acb227d323ff8a619017415cf334c38b457f814119f \ - --hash=sha256:3059668df17627f0e0fa680e9ef8c995c946c792612e9518f5cc1503be14e90b \ - --hash=sha256:31176aa88f36020055ace9adff2405a33c8bdbfa72a9c4980e25d91b2f196873 \ - --hash=sha256:36f53c2b3449c015880e7d55a89c992c357f176327b0d2873cdaaf9628a37c69 \ - --hash=sha256:3b4368b33908f683a363f376dfb747d40af3463a6e5044afee07cf9436addf96 \ - --hash=sha256:3c61d641d4f409c5ae46bfdd89ea42ce5ea233dcf69e74ce9ba32b503c727e29 \ - --hash=sha256:4abb717e320e74959517dc8e84a9f48fbe90e9abe19c248541e9418b1ce60acd \ - --hash=sha256:4c93f4abbb54321ee6471e04a00139c80c754eda51064187963ddf98f5cf36a4 \ - --hash=sha256:535561990e075fa6bd4b16c4c3c1096b9581b7bb35d96fac4650f1181e428268 \ - --hash=sha256:53c9aa5ddd6857c0a1cd0287225a2a25873a8e09727c2e95c4aebb1be83a766a \ - --hash=sha256:5d573e70a6fe77555fb6143c12d3a7d3fa306632a3034b4e7c59ca09721546f8 \ - --hash=sha256:6009386a2df66159f64ac9f20425ae25229b29b9dd0e1d3dd60043f037e2ad7e \ - --hash=sha256:686e975a5d16602dc0982c7c703948d17184bd1397e16c8ee03511ecb8c4cdda \ - 
--hash=sha256:6959fb07e8351e20501ffb8cc4074c39a0b7ef123e1c850a7f8f3afdc3a3da01 \ - --hash=sha256:6b25ed37c27e652db01be341af93fbcea03d296c024d8a0e680017a268eb85dd \ - --hash=sha256:6da6dea3a1bacf99b3c2187e296db9a83029ed9c38fd4c52b7c9b7326d13c828 \ - --hash=sha256:72ca2399097c0b758198f2ff30f7178d680de8a5cfcf3d9b73a63cf87455532e \ - --hash=sha256:73abb8584b0cf74d37f5ef61c10722adc7275502ab71789a8fe3cb7ef04cf6e2 \ - --hash=sha256:74100fecaec8a535e380cf5f2fb556ff84957d481c13e54051c52e5baac70541 \ - --hash=sha256:75c6ecb70e809cf1504465174343113f51f24bc61e22a80ae1c859f3f7034c6d \ - --hash=sha256:7cf05053242f61ba94014dd3a986e11a083400a32664058f80bf4cf817c0b3a1 \ - --hash=sha256:9411e24328a2302e279e70cae6e479f1fddde79629fcb14e03e6d94b3956eabf \ - --hash=sha256:a213acfbf186b9f35803b52e4ca9addb153fc0b67f82a48f961be7000ecf6721 \ - --hash=sha256:bb7e0fe6ad73b7f06d7e2b689c19a71cf5cc48f0c2bf8608469e51ffe0bd2867 \ - --hash=sha256:c2504eed520958a5b77cc99458297cb7906308cb92327f35fb7fbbad4e9b2188 \ - --hash=sha256:c35aa9657f5d5116d23b934568e0956bd50c615127810fffe3ac356a914c176a \ - --hash=sha256:c5f09cffa619adfb44799fa4a81c2a1ad77c887187613fb0a8f201ab38d89ba1 \ - --hash=sha256:c978f864b35f2261e0819f5cd88b9830b04dc51bcf055aac3c601e525a10d2ba \ - --hash=sha256:cbe946b3e6e60a7b4618f091e62a029cb082b109a9d6b53962dd305087c6e4fd \ - --hash=sha256:cc3e4cd087f07758b16bef8f31d88dbb1b5da5671d2f03685ab52dece3d7a16e \ - --hash=sha256:cf0dead5a2c5a3347af2cfec7131d4f2a2e03c934af28989c9078f8241a491fa \ - --hash=sha256:d2794f0e68b3085d99b4f6ff9c089f6fdd02b32b9d3efdfbb55beac1bf22d516 \ - --hash=sha256:d2fa68a96a30dd240be80bbad838a0ac81a61770611ff7952b889485970c4c71 \ - --hash=sha256:d6f70406695e3220f09cd7a2f879333279d91aa4a8a1d34303b56d61a8180137 \ - --hash=sha256:d8f9cd4ad1be90b0cf350a2f04a38a36e44a026cac1e036ac593dc48efe91d52 \ - --hash=sha256:da2d94c15f88cd40d7e67f7919d4f60110d2b9d5b1e08cf354c2be773ab13479 \ - --hash=sha256:e1727c1c0e394096bb9af185c6923e8ea55a5095b8af44f06903bcc0e06800a2 \ - 
--hash=sha256:e420ced29b5904cdf9ee5545e23f9406189d8acb6750916c2db4793dada065c6 \ - --hash=sha256:e82c5cf1495244adf5252f925ac5932e5fd288b3e5ab6b70bec5593074b7236c \ - --hash=sha256:f1ef0d39bc1feb420caf549b3c657c871cad4ebbcf0580c4d03816b0590de0cf \ - --hash=sha256:f8753a6c88d1d0ba64302309eecf20f70d2770f65ca02d83c2452279085bfcd3 \ - --hash=sha256:f93dbf58f03146164048be5426ffde298b237a5e059144847e4940f5b80172c3 +grpcio==1.60.1 \ + --hash=sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8 \ + --hash=sha256:069fe2aeee02dfd2135d562d0663fe70fbb69d5eed6eb3389042a7e963b54de8 \ + --hash=sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104 \ + --hash=sha256:0c5807e9152eff15f1d48f6b9ad3749196f79a4a050469d99eecb679be592acc \ + --hash=sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092 \ + --hash=sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1 \ + --hash=sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180 \ + --hash=sha256:2f44c32aef186bbba254129cea1df08a20be414144ac3bdf0e84b24e3f3b2e05 \ + --hash=sha256:30e980cd6db1088c144b92fe376747328d5554bc7960ce583ec7b7d81cd47287 \ + --hash=sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216 \ + --hash=sha256:33bdea30dcfd4f87b045d404388469eb48a48c33a6195a043d116ed1b9a0196c \ + --hash=sha256:39aa848794b887120b1d35b1b994e445cc028ff602ef267f87c38122c1add50d \ + --hash=sha256:4216e67ad9a4769117433814956031cb300f85edc855252a645a9a724b3b6594 \ + --hash=sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c \ + --hash=sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87 \ + --hash=sha256:50d56280b482875d1f9128ce596e59031a226a8b84bec88cb2bf76c289f5d0de \ + --hash=sha256:53b69e79d00f78c81eecfb38f4516080dc7f36a198b6b37b928f1c13b3c063e9 \ + --hash=sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2 \ + 
--hash=sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff \ + --hash=sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904 \ + --hash=sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73 \ + --hash=sha256:6d140bdeb26cad8b93c1455fa00573c05592793c32053d6e0016ce05ba267549 \ + --hash=sha256:6e490fa5f7f5326222cb9f0b78f207a2b218a14edf39602e083d5f617354306f \ + --hash=sha256:6ecf21d20d02d1733e9c820fb5c114c749d888704a7ec824b545c12e78734d1c \ + --hash=sha256:70c83bb530572917be20c21f3b6be92cd86b9aecb44b0c18b1d3b2cc3ae47df0 \ + --hash=sha256:72153a0d2e425f45b884540a61c6639436ddafa1829a42056aa5764b84108b8e \ + --hash=sha256:73e14acd3d4247169955fae8fb103a2b900cfad21d0c35f0dcd0fdd54cd60367 \ + --hash=sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc \ + --hash=sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303 \ + --hash=sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb \ + --hash=sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c \ + --hash=sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804 \ + --hash=sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03 \ + --hash=sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6 \ + --hash=sha256:9b106bc52e7f28170e624ba61cc7dc6829566e535a6ec68528f8e1afbed1c41f \ + --hash=sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0 \ + --hash=sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd \ + --hash=sha256:a212e5dea1a4182e40cd3e4067ee46be9d10418092ce3627475e995cca95de21 \ + --hash=sha256:a731ac5cffc34dac62053e0da90f0c0b8560396a19f69d9703e88240c8f05858 \ + --hash=sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce \ + --hash=sha256:b58b855d0071575ea9c7bc0d84a06d2edfbfccec52e9657864386381a7ce1ae9 \ + --hash=sha256:bc808924470643b82b14fe121923c30ec211d8c693e747eba8a7414bc4351a23 \ + 
--hash=sha256:c557e94e91a983e5b1e9c60076a8fd79fea1e7e06848eb2e48d0ccfb30f6e073 \ + --hash=sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2 \ + --hash=sha256:c8754c75f55781515a3005063d9a05878b2cfb3cb7e41d5401ad0cf19de14872 \ + --hash=sha256:cb0af13433dbbd1c806e671d81ec75bd324af6ef75171fd7815ca3074fe32bfe \ + --hash=sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7 \ + --hash=sha256:cf77f8cf2a651fbd869fbdcb4a1931464189cd210abc4cfad357f1cacc8642a6 \ + --hash=sha256:d7404cebcdb11bb5bd40bf94131faf7e9a7c10a6c60358580fe83913f360f929 \ + --hash=sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962 \ + --hash=sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a \ + --hash=sha256:f1c3dc536b3ee124e8b24feb7533e5c70b9f2ef833e3b2e5513b2897fd46763a \ + --hash=sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177 \ + --hash=sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525 # via # -r requirements.in # tb-nightly @@ -258,113 +249,115 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.6 \ + --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ + --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f # via requests jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 # via -r requirements.in -keras-nightly==3.0.0.dev2023103103 \ - --hash=sha256:25a6a9030d7e067d535ec41ae6700636c90cabada80fbddb3bb8775006807860 \ - --hash=sha256:c39fccebb3d4cc9d838371981c4d3eef08fc06eadf6cffb39b356cb625bab50f +keras-nightly==3.0.4.dev2024021403 \ + --hash=sha256:24ce69d29d582771685bf4235f59663723405b5a5b16f3eaff2657e52e74663a \ + 
--hash=sha256:9f416e66b820ef833779d219d255b346b8b90a72fdbd0b2f1e90a43ad142a03d # via -r requirements.in -lit==17.0.4 \ - --hash=sha256:ee2e180128e770abc6aed3a02de2daf09d81b7d30225e315205d3599c311d304 +lit==17.0.6 \ + --hash=sha256:dfa9af9b55fc4509a56be7bf2346f079d7f4a242d583b9f2e0b078fd0abae31b # via -r requirements.in -markdown==3.5 \ - --hash=sha256:4afb124395ce5fc34e6d9886dab977fd9ae987fc6e85689f08278cf0c69d4bf3 \ - --hash=sha256:a807eb2e4778d9156c8f07876c6e4d50b5494c5665c4834f67b06459dfd877b3 +markdown==3.5.2 \ + --hash=sha256:d43323865d89fc0cb9b20c75fc8ad313af307cc087e84b657d9eec768eddeadd \ + --hash=sha256:e1ac7b3dc550ee80e602e71c1d168002f062e49f1b11e26a36264dafd4df2ef8 # via tb-nightly markdown-it-py==3.0.0 \ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb # via rich -markupsafe==2.1.3 \ - --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ - --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ - --hash=sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431 \ - --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ - --hash=sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c \ - --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ - --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ - --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ - --hash=sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939 \ - --hash=sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c \ - --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ - --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ - --hash=sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9 \ - 
--hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ - --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ - --hash=sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d \ - --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ - --hash=sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3 \ - --hash=sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00 \ - --hash=sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155 \ - --hash=sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac \ - --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ - --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ - --hash=sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8 \ - --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ - --hash=sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007 \ - --hash=sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24 \ - --hash=sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea \ - --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ - --hash=sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0 \ - --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ - --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ - --hash=sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2 \ - --hash=sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1 \ - --hash=sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707 \ - --hash=sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6 \ - --hash=sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c \ - 
--hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ - --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ - --hash=sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779 \ - --hash=sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636 \ - --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ - --hash=sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad \ - --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ - --hash=sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc \ - --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ - --hash=sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48 \ - --hash=sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7 \ - --hash=sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e \ - --hash=sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b \ - --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ - --hash=sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5 \ - --hash=sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e \ - --hash=sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb \ - --hash=sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9 \ - --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ - --hash=sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc \ - --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ - --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ - --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 +markupsafe==2.1.5 \ + 
--hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ + --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ + --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ + --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ + --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ + --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ + --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ + --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ + --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \ + --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ + --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ + --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ + --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ + --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ + --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ + --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ + --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ + --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ + --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ + --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ + --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ + --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ + --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ + --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ + 
--hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ + --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ + --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ + --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ + --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ + --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ + --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ + --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ + --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ + --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ + --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ + --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ + --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ + --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ + --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ + --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ + --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ + --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ + --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ + --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ + --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ + --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ + --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ + --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ + 
--hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ + --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ + --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ + --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ + --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ + --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ + --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ + --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ + --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ + --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ + --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ + --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 # via werkzeug mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via markdown-it-py -ml-dtypes==0.3.1 \ - --hash=sha256:3d8ca0acbd377082792d8b97081ba580abdad67c6afb7f827012c675b052f058 \ - --hash=sha256:42a8980afd8b7c8e270e8b5c260237286b5b26acd276fcb758d13cd7cb567e99 \ - --hash=sha256:438437e2e614a3c91d75581653b6c40ec890e8b5994d7190a90c931740151c95 \ - --hash=sha256:4828b62fa3bf1ae35faa40f3db9a38ec72fbce02f328a1d14c3a9da4606af364 \ - --hash=sha256:4d94b2d1bed77284694f7fd0479640fa7aa5d96433dca3cbcec407a5ef752e77 \ - --hash=sha256:510d249a91face47211762eb294d6fe64f325356b965fb6388c1bf51bd339267 \ - --hash=sha256:5727effa7650f7ab10906542d137cfb3244fdc3b2b519beff42f82def8ba59be \ - --hash=sha256:5e0b0b6bb07fa5ad11bb61d174667176bee5e05857225067aabfc5adc1b51d23 \ - --hash=sha256:60778f99194b4c4f36ba42da200b35ef851ce4d4af698aaf70f5b91fe70fc611 \ - 
--hash=sha256:70984b473db6489ec1d8c79b082a1322105155193049d08a3b0c515094e9777b \ - --hash=sha256:979d7d196d9a17e0135ae22878f74241fbd3522cef58d7b292f1fd5b32282201 \ - --hash=sha256:a777928dcba8865ab4a8157eeb25d23aed7bc82e5fd74e1d5eca821d3f148b39 \ - --hash=sha256:cb0c404e0dd3e25b56362c1c1e5de0ef717f727dde59fa721af4ce6ab2acca44 \ - --hash=sha256:d1a8dc3bac1da2a17d0e2e4cba36ee89721d0bd33ea4765af2eefb5f41409e0f \ - --hash=sha256:da274599e4950a9b488d21571061f49a185537cc77f2d3f8121151d58a9e9f16 \ - --hash=sha256:f83ff080df8910c0f987f615b03e4f8198638e0c00c6e679ea8892dda909763b \ - --hash=sha256:fcae2c69715410d96906e1dfe8f017d9f78a0d10e0df91aae52e91f51fdfe45e - # via jax +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + --hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + --hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + 
--hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via + # jax + # keras-nightly namex==0.0.7 \ --hash=sha256:84ba65bc4d22bd909e3d26bf2ffb4b9529b608cb3f9a4336f776b04204ced69b \ --hash=sha256:8a4f062945f405d77cb66b907f16aa2fd83681945e998be840eb6c4154d40108 @@ -407,10 +400,6 @@ numpy==1.23.5 ; python_version <= "3.11" \ # opt-einsum # scipy # tb-nightly -oauthlib==3.2.2 \ - --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ - --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 - # via requests-oauthlib opt-einsum==3.3.0 \ --hash=sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147 \ --hash=sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549 @@ -440,57 +429,36 @@ protobuf==4.23.4 \ --hash=sha256:effeac51ab79332d44fba74660d40ae79985901ac21bca408f8dc335a81aa597 \ --hash=sha256:fee88269a090ada09ca63551bf2f573eb2424035bcf2cb1b121895b01a46594a # via tb-nightly -psutil==5.9.6 \ - --hash=sha256:10e8c17b4f898d64b121149afb136c53ea8b68c7531155147867b7b1ac9e7e28 \ - --hash=sha256:18cd22c5db486f33998f37e2bb054cc62fd06646995285e02a51b1e08da97017 \ - --hash=sha256:3ebf2158c16cc69db777e3c7decb3c0f43a7af94a60d72e87b2823aebac3d602 \ - --hash=sha256:51dc3d54607c73148f63732c727856f5febec1c7c336f8f41fcbd6315cce76ac \ - --hash=sha256:6e5fb8dc711a514da83098bc5234264e551ad980cec5f85dabf4d38ed6f15e9a \ - --hash=sha256:70cb3beb98bc3fd5ac9ac617a327af7e7f826373ee64c80efd4eb2856e5051e9 \ - --hash=sha256:748c9dd2583ed86347ed65d0035f45fa8c851e8d90354c122ab72319b5f366f4 \ - --hash=sha256:91ecd2d9c00db9817a4b4192107cf6954addb5d9d67a969a4f436dbc9200f88c \ - --hash=sha256:92e0cc43c524834af53e9d3369245e6cc3b130e78e26100d1f63cdb0abeb3d3c \ - --hash=sha256:a6f01f03bf1843280f4ad16f4bde26b817847b4c1a0db59bf6419807bc5ce05c \ - 
--hash=sha256:c69596f9fc2f8acd574a12d5f8b7b1ba3765a641ea5d60fb4736bf3c08a8214a \ - --hash=sha256:ca2780f5e038379e520281e4c032dddd086906ddff9ef0d1b9dcf00710e5071c \ - --hash=sha256:daecbcbd29b289aac14ece28eca6a3e60aa361754cf6da3dfb20d4d32b6c7f57 \ - --hash=sha256:e4b92ddcd7dd4cdd3f900180ea1e104932c7bce234fb88976e2a3b296441225a \ - --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ - --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa +psutil==5.9.8 \ + --hash=sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d \ + --hash=sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73 \ + --hash=sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8 \ + --hash=sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2 \ + --hash=sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e \ + --hash=sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36 \ + --hash=sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7 \ + --hash=sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c \ + --hash=sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee \ + --hash=sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421 \ + --hash=sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf \ + --hash=sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81 \ + --hash=sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0 \ + --hash=sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631 \ + --hash=sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4 \ + --hash=sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8 # via portpicker -pyasn1==0.5.0 \ - --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ - 
--hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.3.0 \ - --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ - --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d - # via google-auth -pygments==2.16.1 \ - --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \ - --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29 +pygments==2.17.2 \ + --hash=sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c \ + --hash=sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367 # via rich requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 - # via - # -r requirements.in - # requests-oauthlib - # tb-nightly -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a - # via google-auth-oauthlib -rich==13.6.0 \ - --hash=sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245 \ - --hash=sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef + # via -r requirements.in +rich==13.7.0 \ + --hash=sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa \ + --hash=sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235 # via keras-nightly -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth scipy==1.11.3 \ --hash=sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3 \ --hash=sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929 \ @@ -526,8 +494,8 @@ six==1.16.0 \ # via # 
astunparse # tb-nightly -tb-nightly==2.15.0a20231023 \ - --hash=sha256:8990a52985e3296aa18a6825efc017bcded9e2fb2cbcdd2d01f5c62d4fdc9825 +tb-nightly==2.17.0a20240214 \ + --hash=sha256:dbd59bcfd9b028e6199050e36cbb4ee1db4a94e69a7f8c00865f1498112a2b83 # via -r requirements.in tblib==2.0.0 \ --hash=sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a \ @@ -542,13 +510,17 @@ termcolor==2.3.0 \ --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ --hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a # via -r requirements.in +tf-keras-nightly==2.16.0.dev2024021410 \ + --hash=sha256:05a4c19c6a795ec9ed6f4dac69d443601b9206b910524d8b19bad6e362d49f47 \ + --hash=sha256:83d5a1dd3979b42164a2fac4bdd2ed5981ef0c2a0514ca64e4bc67e369caef5c + # via tb-nightly typing-extensions==4.8.0 \ --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef # via -r requirements.in -urllib3==2.0.7 \ - --hash=sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84 \ - --hash=sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e +urllib3==2.2.0 \ + --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \ + --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224 # via requests werkzeug==3.0.1 \ --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \ @@ -560,81 +532,77 @@ wheel==0.41.3 \ # via # -r requirements.in # astunparse -wrapt==1.14.1 \ - --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ - --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ - --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ - --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ - 
--hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ - --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ - --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ - --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ - --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ - --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ - --hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ - --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ - --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ - --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ - --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ - --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ - --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ - --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ - --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ - --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ - --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ - --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ - --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ - --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ - --hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ - --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ - --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ - --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ - 
--hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ - --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ - --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ - --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ - --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ - --hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ - --hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ - --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ - --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ - --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ - --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ - --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ - --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ - --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ - --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ - --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ - --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ - --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ - --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ - --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ - --hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ - --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ - --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ - --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ - 
--hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ - --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ - --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ - --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ - --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ - --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ - --hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ - --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ - --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ - --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ - --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ - --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ - --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ - --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ - --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ - --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ - --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ - --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ - --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ - --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ - --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ - --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + 
--hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + --hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + 
--hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + --hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + 
--hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + --hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 # via -r requirements.in setuptools==68.2.2 \ diff --git a/requirements_lock_3_12.txt b/requirements_lock_3_12.txt index 4697b1849dc273..9bc6eff7313ec3 100644 --- a/requirements_lock_3_12.txt +++ b/requirements_lock_3_12.txt @@ -1,6 +1,6 @@ -absl-py==2.0.0 \ - 
--hash=sha256:9a28abb62774ae4e8edbe2dd4c49ffcd45a6a848952a5eccc6a49f3f0fc1e2f3 \ - --hash=sha256:d9690211c5fcfefcdd1a45470ac2b5c5acd45241c3af71eed96bc5441746c0d5 +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff # via # keras-nightly # tb-nightly @@ -12,105 +12,101 @@ astunparse==1.6.3 \ --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ --hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 # via -r requirements.in -cachetools==5.3.2 \ - --hash=sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2 \ - --hash=sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1 - # via google-auth -certifi==2023.7.22 \ - --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ - --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 +certifi==2024.2.2 \ + --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ + --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 # via requests -charset-normalizer==3.3.1 \ - --hash=sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5 \ - --hash=sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93 \ - --hash=sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a \ - --hash=sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d \ - --hash=sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c \ - --hash=sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1 \ - --hash=sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58 \ - --hash=sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2 \ - --hash=sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557 \ - 
--hash=sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147 \ - --hash=sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041 \ - --hash=sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2 \ - --hash=sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2 \ - --hash=sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7 \ - --hash=sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296 \ - --hash=sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690 \ - --hash=sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67 \ - --hash=sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57 \ - --hash=sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597 \ - --hash=sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846 \ - --hash=sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b \ - --hash=sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97 \ - --hash=sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c \ - --hash=sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62 \ - --hash=sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa \ - --hash=sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f \ - --hash=sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e \ - --hash=sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821 \ - --hash=sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3 \ - --hash=sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4 \ - --hash=sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb \ - --hash=sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727 \ - --hash=sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514 \ - 
--hash=sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d \ - --hash=sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761 \ - --hash=sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55 \ - --hash=sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f \ - --hash=sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c \ - --hash=sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034 \ - --hash=sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6 \ - --hash=sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae \ - --hash=sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1 \ - --hash=sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14 \ - --hash=sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1 \ - --hash=sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228 \ - --hash=sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708 \ - --hash=sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48 \ - --hash=sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f \ - --hash=sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5 \ - --hash=sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f \ - --hash=sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4 \ - --hash=sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8 \ - --hash=sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff \ - --hash=sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61 \ - --hash=sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b \ - --hash=sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97 \ - --hash=sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b \ - 
--hash=sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605 \ - --hash=sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728 \ - --hash=sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d \ - --hash=sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c \ - --hash=sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf \ - --hash=sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673 \ - --hash=sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1 \ - --hash=sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b \ - --hash=sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41 \ - --hash=sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8 \ - --hash=sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f \ - --hash=sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4 \ - --hash=sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008 \ - --hash=sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9 \ - --hash=sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5 \ - --hash=sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f \ - --hash=sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e \ - --hash=sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273 \ - --hash=sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45 \ - --hash=sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e \ - --hash=sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656 \ - --hash=sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e \ - --hash=sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c \ - --hash=sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2 \ - 
--hash=sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72 \ - --hash=sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056 \ - --hash=sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397 \ - --hash=sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42 \ - --hash=sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd \ - --hash=sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3 \ - --hash=sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213 \ - --hash=sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf \ - --hash=sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67 +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + 
--hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + --hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + 
--hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + --hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + 
--hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + --hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + 
--hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests dill==0.3.7 \ --hash=sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e \ @@ -133,6 +129,7 @@ dm-tree==0.1.8 \ --hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ @@ -140,6 +137,8 @@ dm-tree==0.1.8 \ --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ @@ -147,6 +146,7 @@ dm-tree==0.1.8 \ --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ 
--hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ @@ -154,6 +154,7 @@ dm-tree==0.1.8 \ --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ --hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via keras-nightly @@ -161,71 +162,61 @@ gast==0.4.0 \ --hash=sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1 \ --hash=sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4 # via -r requirements.in -google-auth==2.23.3 \ - --hash=sha256:6864247895eea5d13b9c57c9e03abb49cb94ce2dc7c58e91cba3248c7477c9e3 \ - --hash=sha256:a8f4608e65c244ead9e0538f181a96c6e11199ec114d41f1d7b1bffa96937bda - # via - # google-auth-oauthlib - # tb-nightly -google-auth-oauthlib==1.1.0 \ - --hash=sha256:089c6e587d36f4803ac7e0720c045c6a8b1fd1790088b8424975b90d0ee61c12 \ - --hash=sha256:83ea8c3b0881e453790baff4448e8a6112ac8778d1de9da0b68010b843937afb - # via tb-nightly -grpcio==1.59.2 \ - --hash=sha256:023088764012411affe7db183d1ada3ad9daf2e23ddc719ff46d7061de661340 \ - --hash=sha256:08d77e682f2bf730a4961eea330e56d2f423c6a9b91ca222e5b1eb24a357b19f \ - --hash=sha256:0a4a3833c0e067f3558538727235cd8a49709bff1003200bbdefa2f09334e4b1 \ - --hash=sha256:0a754aff9e3af63bdc4c75c234b86b9d14e14a28a30c4e324aed1a9b873d755f \ - 
--hash=sha256:11168ef43e4a43ff1b1a65859f3e0ef1a173e277349e7fb16923ff108160a8cd \ - --hash=sha256:128e20f57c5f27cb0157e73756d1586b83c1b513ebecc83ea0ac37e4b0e4e758 \ - --hash=sha256:1f9524d1d701e399462d2c90ba7c193e49d1711cf429c0d3d97c966856e03d00 \ - --hash=sha256:1ff16d68bf453275466a9a46739061a63584d92f18a0f5b33d19fc97eb69867c \ - --hash=sha256:2067274c88bc6de89c278a672a652b4247d088811ece781a4858b09bdf8448e3 \ - --hash=sha256:2171c39f355ba5b551c5d5928d65aa6c69807fae195b86ef4a7d125bcdb860a9 \ - --hash=sha256:242adc47725b9a499ee77c6a2e36688fa6c96484611f33b1be4c57ab075a92dd \ - --hash=sha256:27f879ae604a7fcf371e59fba6f3ff4635a4c2a64768bd83ff0cac503142fef4 \ - --hash=sha256:2b230028a008ae1d0f430acb227d323ff8a619017415cf334c38b457f814119f \ - --hash=sha256:3059668df17627f0e0fa680e9ef8c995c946c792612e9518f5cc1503be14e90b \ - --hash=sha256:31176aa88f36020055ace9adff2405a33c8bdbfa72a9c4980e25d91b2f196873 \ - --hash=sha256:36f53c2b3449c015880e7d55a89c992c357f176327b0d2873cdaaf9628a37c69 \ - --hash=sha256:3b4368b33908f683a363f376dfb747d40af3463a6e5044afee07cf9436addf96 \ - --hash=sha256:3c61d641d4f409c5ae46bfdd89ea42ce5ea233dcf69e74ce9ba32b503c727e29 \ - --hash=sha256:4abb717e320e74959517dc8e84a9f48fbe90e9abe19c248541e9418b1ce60acd \ - --hash=sha256:4c93f4abbb54321ee6471e04a00139c80c754eda51064187963ddf98f5cf36a4 \ - --hash=sha256:535561990e075fa6bd4b16c4c3c1096b9581b7bb35d96fac4650f1181e428268 \ - --hash=sha256:53c9aa5ddd6857c0a1cd0287225a2a25873a8e09727c2e95c4aebb1be83a766a \ - --hash=sha256:5d573e70a6fe77555fb6143c12d3a7d3fa306632a3034b4e7c59ca09721546f8 \ - --hash=sha256:6009386a2df66159f64ac9f20425ae25229b29b9dd0e1d3dd60043f037e2ad7e \ - --hash=sha256:686e975a5d16602dc0982c7c703948d17184bd1397e16c8ee03511ecb8c4cdda \ - --hash=sha256:6959fb07e8351e20501ffb8cc4074c39a0b7ef123e1c850a7f8f3afdc3a3da01 \ - --hash=sha256:6b25ed37c27e652db01be341af93fbcea03d296c024d8a0e680017a268eb85dd \ - --hash=sha256:6da6dea3a1bacf99b3c2187e296db9a83029ed9c38fd4c52b7c9b7326d13c828 \ - 
--hash=sha256:72ca2399097c0b758198f2ff30f7178d680de8a5cfcf3d9b73a63cf87455532e \ - --hash=sha256:73abb8584b0cf74d37f5ef61c10722adc7275502ab71789a8fe3cb7ef04cf6e2 \ - --hash=sha256:74100fecaec8a535e380cf5f2fb556ff84957d481c13e54051c52e5baac70541 \ - --hash=sha256:75c6ecb70e809cf1504465174343113f51f24bc61e22a80ae1c859f3f7034c6d \ - --hash=sha256:7cf05053242f61ba94014dd3a986e11a083400a32664058f80bf4cf817c0b3a1 \ - --hash=sha256:9411e24328a2302e279e70cae6e479f1fddde79629fcb14e03e6d94b3956eabf \ - --hash=sha256:a213acfbf186b9f35803b52e4ca9addb153fc0b67f82a48f961be7000ecf6721 \ - --hash=sha256:bb7e0fe6ad73b7f06d7e2b689c19a71cf5cc48f0c2bf8608469e51ffe0bd2867 \ - --hash=sha256:c2504eed520958a5b77cc99458297cb7906308cb92327f35fb7fbbad4e9b2188 \ - --hash=sha256:c35aa9657f5d5116d23b934568e0956bd50c615127810fffe3ac356a914c176a \ - --hash=sha256:c5f09cffa619adfb44799fa4a81c2a1ad77c887187613fb0a8f201ab38d89ba1 \ - --hash=sha256:c978f864b35f2261e0819f5cd88b9830b04dc51bcf055aac3c601e525a10d2ba \ - --hash=sha256:cbe946b3e6e60a7b4618f091e62a029cb082b109a9d6b53962dd305087c6e4fd \ - --hash=sha256:cc3e4cd087f07758b16bef8f31d88dbb1b5da5671d2f03685ab52dece3d7a16e \ - --hash=sha256:cf0dead5a2c5a3347af2cfec7131d4f2a2e03c934af28989c9078f8241a491fa \ - --hash=sha256:d2794f0e68b3085d99b4f6ff9c089f6fdd02b32b9d3efdfbb55beac1bf22d516 \ - --hash=sha256:d2fa68a96a30dd240be80bbad838a0ac81a61770611ff7952b889485970c4c71 \ - --hash=sha256:d6f70406695e3220f09cd7a2f879333279d91aa4a8a1d34303b56d61a8180137 \ - --hash=sha256:d8f9cd4ad1be90b0cf350a2f04a38a36e44a026cac1e036ac593dc48efe91d52 \ - --hash=sha256:da2d94c15f88cd40d7e67f7919d4f60110d2b9d5b1e08cf354c2be773ab13479 \ - --hash=sha256:e1727c1c0e394096bb9af185c6923e8ea55a5095b8af44f06903bcc0e06800a2 \ - --hash=sha256:e420ced29b5904cdf9ee5545e23f9406189d8acb6750916c2db4793dada065c6 \ - --hash=sha256:e82c5cf1495244adf5252f925ac5932e5fd288b3e5ab6b70bec5593074b7236c \ - --hash=sha256:f1ef0d39bc1feb420caf549b3c657c871cad4ebbcf0580c4d03816b0590de0cf \ - 
--hash=sha256:f8753a6c88d1d0ba64302309eecf20f70d2770f65ca02d83c2452279085bfcd3 \ - --hash=sha256:f93dbf58f03146164048be5426ffde298b237a5e059144847e4940f5b80172c3 +grpcio==1.60.1 \ + --hash=sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8 \ + --hash=sha256:069fe2aeee02dfd2135d562d0663fe70fbb69d5eed6eb3389042a7e963b54de8 \ + --hash=sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104 \ + --hash=sha256:0c5807e9152eff15f1d48f6b9ad3749196f79a4a050469d99eecb679be592acc \ + --hash=sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092 \ + --hash=sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1 \ + --hash=sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180 \ + --hash=sha256:2f44c32aef186bbba254129cea1df08a20be414144ac3bdf0e84b24e3f3b2e05 \ + --hash=sha256:30e980cd6db1088c144b92fe376747328d5554bc7960ce583ec7b7d81cd47287 \ + --hash=sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216 \ + --hash=sha256:33bdea30dcfd4f87b045d404388469eb48a48c33a6195a043d116ed1b9a0196c \ + --hash=sha256:39aa848794b887120b1d35b1b994e445cc028ff602ef267f87c38122c1add50d \ + --hash=sha256:4216e67ad9a4769117433814956031cb300f85edc855252a645a9a724b3b6594 \ + --hash=sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c \ + --hash=sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87 \ + --hash=sha256:50d56280b482875d1f9128ce596e59031a226a8b84bec88cb2bf76c289f5d0de \ + --hash=sha256:53b69e79d00f78c81eecfb38f4516080dc7f36a198b6b37b928f1c13b3c063e9 \ + --hash=sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2 \ + --hash=sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff \ + --hash=sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904 \ + --hash=sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73 \ + 
--hash=sha256:6d140bdeb26cad8b93c1455fa00573c05592793c32053d6e0016ce05ba267549 \ + --hash=sha256:6e490fa5f7f5326222cb9f0b78f207a2b218a14edf39602e083d5f617354306f \ + --hash=sha256:6ecf21d20d02d1733e9c820fb5c114c749d888704a7ec824b545c12e78734d1c \ + --hash=sha256:70c83bb530572917be20c21f3b6be92cd86b9aecb44b0c18b1d3b2cc3ae47df0 \ + --hash=sha256:72153a0d2e425f45b884540a61c6639436ddafa1829a42056aa5764b84108b8e \ + --hash=sha256:73e14acd3d4247169955fae8fb103a2b900cfad21d0c35f0dcd0fdd54cd60367 \ + --hash=sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc \ + --hash=sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303 \ + --hash=sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb \ + --hash=sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c \ + --hash=sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804 \ + --hash=sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03 \ + --hash=sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6 \ + --hash=sha256:9b106bc52e7f28170e624ba61cc7dc6829566e535a6ec68528f8e1afbed1c41f \ + --hash=sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0 \ + --hash=sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd \ + --hash=sha256:a212e5dea1a4182e40cd3e4067ee46be9d10418092ce3627475e995cca95de21 \ + --hash=sha256:a731ac5cffc34dac62053e0da90f0c0b8560396a19f69d9703e88240c8f05858 \ + --hash=sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce \ + --hash=sha256:b58b855d0071575ea9c7bc0d84a06d2edfbfccec52e9657864386381a7ce1ae9 \ + --hash=sha256:bc808924470643b82b14fe121923c30ec211d8c693e747eba8a7414bc4351a23 \ + --hash=sha256:c557e94e91a983e5b1e9c60076a8fd79fea1e7e06848eb2e48d0ccfb30f6e073 \ + --hash=sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2 \ + --hash=sha256:c8754c75f55781515a3005063d9a05878b2cfb3cb7e41d5401ad0cf19de14872 \ + 
--hash=sha256:cb0af13433dbbd1c806e671d81ec75bd324af6ef75171fd7815ca3074fe32bfe \ + --hash=sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7 \ + --hash=sha256:cf77f8cf2a651fbd869fbdcb4a1931464189cd210abc4cfad357f1cacc8642a6 \ + --hash=sha256:d7404cebcdb11bb5bd40bf94131faf7e9a7c10a6c60358580fe83913f360f929 \ + --hash=sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962 \ + --hash=sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a \ + --hash=sha256:f1c3dc536b3ee124e8b24feb7533e5c70b9f2ef833e3b2e5513b2897fd46763a \ + --hash=sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177 \ + --hash=sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525 # via # -r requirements.in # tb-nightly @@ -258,150 +249,156 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.6 \ + --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ + --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f # via requests jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 # via -r requirements.in -keras-nightly==3.0.0.dev2023103103 \ - --hash=sha256:25a6a9030d7e067d535ec41ae6700636c90cabada80fbddb3bb8775006807860 \ - --hash=sha256:c39fccebb3d4cc9d838371981c4d3eef08fc06eadf6cffb39b356cb625bab50f +keras-nightly==3.0.4.dev2024021403 \ + --hash=sha256:24ce69d29d582771685bf4235f59663723405b5a5b16f3eaff2657e52e74663a \ + --hash=sha256:9f416e66b820ef833779d219d255b346b8b90a72fdbd0b2f1e90a43ad142a03d # via -r requirements.in -lit==17.0.4 \ - --hash=sha256:ee2e180128e770abc6aed3a02de2daf09d81b7d30225e315205d3599c311d304 +lit==17.0.6 \ + --hash=sha256:dfa9af9b55fc4509a56be7bf2346f079d7f4a242d583b9f2e0b078fd0abae31b # via -r requirements.in 
-markdown==3.5 \ - --hash=sha256:4afb124395ce5fc34e6d9886dab977fd9ae987fc6e85689f08278cf0c69d4bf3 \ - --hash=sha256:a807eb2e4778d9156c8f07876c6e4d50b5494c5665c4834f67b06459dfd877b3 +markdown==3.5.2 \ + --hash=sha256:d43323865d89fc0cb9b20c75fc8ad313af307cc087e84b657d9eec768eddeadd \ + --hash=sha256:e1ac7b3dc550ee80e602e71c1d168002f062e49f1b11e26a36264dafd4df2ef8 # via tb-nightly markdown-it-py==3.0.0 \ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb # via rich -markupsafe==2.1.3 \ - --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ - --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ - --hash=sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431 \ - --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ - --hash=sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c \ - --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ - --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ - --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ - --hash=sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939 \ - --hash=sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c \ - --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ - --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ - --hash=sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9 \ - --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ - --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ - --hash=sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d \ - 
--hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ - --hash=sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3 \ - --hash=sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00 \ - --hash=sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155 \ - --hash=sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac \ - --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ - --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ - --hash=sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8 \ - --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ - --hash=sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007 \ - --hash=sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24 \ - --hash=sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea \ - --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ - --hash=sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0 \ - --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ - --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ - --hash=sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2 \ - --hash=sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1 \ - --hash=sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707 \ - --hash=sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6 \ - --hash=sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c \ - --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ - --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ - --hash=sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779 \ - 
--hash=sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636 \ - --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ - --hash=sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad \ - --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ - --hash=sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc \ - --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ - --hash=sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48 \ - --hash=sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7 \ - --hash=sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e \ - --hash=sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b \ - --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ - --hash=sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5 \ - --hash=sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e \ - --hash=sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb \ - --hash=sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9 \ - --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ - --hash=sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc \ - --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ - --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ - --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 +markupsafe==2.1.5 \ + --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ + --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ + --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ + 
--hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ + --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ + --hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ + --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ + --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ + --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \ + --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ + --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ + --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ + --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ + --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ + --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ + --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ + --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ + --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ + --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ + --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ + --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ + --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ + --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ + --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ + --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ + --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ + --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ + 
--hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ + --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ + --hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ + --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ + --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ + --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ + --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ + --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ + --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ + --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ + --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ + --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ + --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ + --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ + --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ + --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ + --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ + --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ + --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ + --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ + --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ + --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ + --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ + --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ + 
--hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ + --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ + --hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ + --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ + --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ + --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ + --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ + --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ + --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 # via werkzeug mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via markdown-it-py -ml-dtypes==0.3.1 \ - --hash=sha256:3d8ca0acbd377082792d8b97081ba580abdad67c6afb7f827012c675b052f058 \ - --hash=sha256:42a8980afd8b7c8e270e8b5c260237286b5b26acd276fcb758d13cd7cb567e99 \ - --hash=sha256:438437e2e614a3c91d75581653b6c40ec890e8b5994d7190a90c931740151c95 \ - --hash=sha256:4828b62fa3bf1ae35faa40f3db9a38ec72fbce02f328a1d14c3a9da4606af364 \ - --hash=sha256:4d94b2d1bed77284694f7fd0479640fa7aa5d96433dca3cbcec407a5ef752e77 \ - --hash=sha256:510d249a91face47211762eb294d6fe64f325356b965fb6388c1bf51bd339267 \ - --hash=sha256:5727effa7650f7ab10906542d137cfb3244fdc3b2b519beff42f82def8ba59be \ - --hash=sha256:5e0b0b6bb07fa5ad11bb61d174667176bee5e05857225067aabfc5adc1b51d23 \ - --hash=sha256:60778f99194b4c4f36ba42da200b35ef851ce4d4af698aaf70f5b91fe70fc611 \ - --hash=sha256:70984b473db6489ec1d8c79b082a1322105155193049d08a3b0c515094e9777b \ - --hash=sha256:979d7d196d9a17e0135ae22878f74241fbd3522cef58d7b292f1fd5b32282201 \ - --hash=sha256:a777928dcba8865ab4a8157eeb25d23aed7bc82e5fd74e1d5eca821d3f148b39 \ - 
--hash=sha256:cb0c404e0dd3e25b56362c1c1e5de0ef717f727dde59fa721af4ce6ab2acca44 \ - --hash=sha256:d1a8dc3bac1da2a17d0e2e4cba36ee89721d0bd33ea4765af2eefb5f41409e0f \ - --hash=sha256:da274599e4950a9b488d21571061f49a185537cc77f2d3f8121151d58a9e9f16 \ - --hash=sha256:f83ff080df8910c0f987f615b03e4f8198638e0c00c6e679ea8892dda909763b \ - --hash=sha256:fcae2c69715410d96906e1dfe8f017d9f78a0d10e0df91aae52e91f51fdfe45e - # via jax +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + --hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + --hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + --hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via + # jax + # keras-nightly namex==0.0.7 \ --hash=sha256:84ba65bc4d22bd909e3d26bf2ffb4b9529b608cb3f9a4336f776b04204ced69b \ 
--hash=sha256:8a4f062945f405d77cb66b907f16aa2fd83681945e998be840eb6c4154d40108 # via keras-nightly -numpy==1.26.1 ; python_version >= "3.12" \ - --hash=sha256:06934e1a22c54636a059215d6da99e23286424f316fddd979f5071093b648668 \ - --hash=sha256:1c59c046c31a43310ad0199d6299e59f57a289e22f0f36951ced1c9eac3665b9 \ - --hash=sha256:1d1bd82d539607951cac963388534da3b7ea0e18b149a53cf883d8f699178c0f \ - --hash=sha256:1e11668d6f756ca5ef534b5be8653d16c5352cbb210a5c2a79ff288e937010d5 \ - --hash=sha256:3649d566e2fc067597125428db15d60eb42a4e0897fc48d28cb75dc2e0454e53 \ - --hash=sha256:59227c981d43425ca5e5c01094d59eb14e8772ce6975d4b2fc1e106a833d5ae2 \ - --hash=sha256:6081aed64714a18c72b168a9276095ef9155dd7888b9e74b5987808f0dd0a974 \ - --hash=sha256:6965888d65d2848e8768824ca8288db0a81263c1efccec881cb35a0d805fcd2f \ - --hash=sha256:76ff661a867d9272cd2a99eed002470f46dbe0943a5ffd140f49be84f68ffc42 \ - --hash=sha256:78ca54b2f9daffa5f323f34cdf21e1d9779a54073f0018a3094ab907938331a2 \ - --hash=sha256:82e871307a6331b5f09efda3c22e03c095d957f04bf6bc1804f30048d0e5e7af \ - --hash=sha256:8ab9163ca8aeb7fd32fe93866490654d2f7dda4e61bc6297bf72ce07fdc02f67 \ - --hash=sha256:9696aa2e35cc41e398a6d42d147cf326f8f9d81befcb399bc1ed7ffea339b64e \ - --hash=sha256:97e5d6a9f0702c2863aaabf19f0d1b6c2628fbe476438ce0b5ce06e83085064c \ - --hash=sha256:9f42284ebf91bdf32fafac29d29d4c07e5e9d1af862ea73686581773ef9e73a7 \ - --hash=sha256:a03fb25610ef560a6201ff06df4f8105292ba56e7cdd196ea350d123fc32e24e \ - --hash=sha256:a5b411040beead47a228bde3b2241100454a6abde9df139ed087bd73fc0a4908 \ - --hash=sha256:af22f3d8e228d84d1c0c44c1fbdeb80f97a15a0abe4f080960393a00db733b66 \ - --hash=sha256:afd5ced4e5a96dac6725daeb5242a35494243f2239244fad10a90ce58b071d24 \ - --hash=sha256:b9d45d1dbb9de84894cc50efece5b09939752a2d75aab3a8b0cef6f3a35ecd6b \ - --hash=sha256:bb894accfd16b867d8643fc2ba6c8617c78ba2828051e9a69511644ce86ce83e \ - --hash=sha256:c8c6c72d4a9f831f328efb1312642a1cafafaa88981d9ab76368d50d07d93cbe \ - 
--hash=sha256:cd7837b2b734ca72959a1caf3309457a318c934abef7a43a14bb984e574bbb9a \ - --hash=sha256:cdd9ec98f0063d93baeb01aad472a1a0840dee302842a2746a7a8e92968f9575 \ - --hash=sha256:d1cfc92db6af1fd37a7bb58e55c8383b4aa1ba23d012bdbba26b4bcca45ac297 \ - --hash=sha256:d1d2c6b7dd618c41e202c59c1413ef9b2c8e8a15f5039e344af64195459e3104 \ - --hash=sha256:d2984cb6caaf05294b8466966627e80bf6c7afd273279077679cb010acb0e5ab \ - --hash=sha256:d58e8c51a7cf43090d124d5073bc29ab2755822181fcad978b12e144e5e5a4b3 \ - --hash=sha256:d78f269e0c4fd365fc2992c00353e4530d274ba68f15e968d8bc3c69ce5f5244 \ - --hash=sha256:dcfaf015b79d1f9f9c9fd0731a907407dc3e45769262d657d754c3a028586124 \ - --hash=sha256:e44ccb93f30c75dfc0c3aa3ce38f33486a75ec9abadabd4e59f114994a9c4617 \ - --hash=sha256:e509cbc488c735b43b5ffea175235cec24bbc57b227ef1acc691725beb230d1c +numpy==1.26.4 ; python_version >= "3.12" \ + --hash=sha256:03a8c78d01d9781b28a6989f6fa1bb2c4f2d51201cf99d3dd875df6fbd96b23b \ + --hash=sha256:08beddf13648eb95f8d867350f6a018a4be2e5ad54c8d8caed89ebca558b2818 \ + --hash=sha256:1af303d6b2210eb850fcf03064d364652b7120803a0b872f5211f5234b399f20 \ + --hash=sha256:1dda2e7b4ec9dd512f84935c5f126c8bd8b9f2fc001e9f54af255e8c5f16b0e0 \ + --hash=sha256:2a02aba9ed12e4ac4eb3ea9421c420301a0c6460d9830d74a9df87efa4912010 \ + --hash=sha256:2e4ee3380d6de9c9ec04745830fd9e2eccb3e6cf790d39d7b98ffd19b0dd754a \ + --hash=sha256:3373d5d70a5fe74a2c1bb6d2cfd9609ecf686d47a2d7b1d37a8f3b6bf6003aea \ + --hash=sha256:47711010ad8555514b434df65f7d7b076bb8261df1ca9bb78f53d3b2db02e95c \ + --hash=sha256:4c66707fabe114439db9068ee468c26bbdf909cac0fb58686a42a24de1760c71 \ + --hash=sha256:50193e430acfc1346175fcbdaa28ffec49947a06918b7b92130744e81e640110 \ + --hash=sha256:52b8b60467cd7dd1e9ed082188b4e6bb35aa5cdd01777621a1658910745b90be \ + --hash=sha256:60dedbb91afcbfdc9bc0b1f3f402804070deed7392c23eb7a7f07fa857868e8a \ + --hash=sha256:62b8e4b1e28009ef2846b4c7852046736bab361f7aeadeb6a5b89ebec3c7055a \ + 
--hash=sha256:666dbfb6ec68962c033a450943ded891bed2d54e6755e35e5835d63f4f6931d5 \ + --hash=sha256:675d61ffbfa78604709862923189bad94014bef562cc35cf61d3a07bba02a7ed \ + --hash=sha256:679b0076f67ecc0138fd2ede3a8fd196dddc2ad3254069bcb9faf9a79b1cebcd \ + --hash=sha256:7349ab0fa0c429c82442a27a9673fc802ffdb7c7775fad780226cb234965e53c \ + --hash=sha256:7ab55401287bfec946ced39700c053796e7cc0e3acbef09993a9ad2adba6ca6e \ + --hash=sha256:7e50d0a0cc3189f9cb0aeb3a6a6af18c16f59f004b866cd2be1c14b36134a4a0 \ + --hash=sha256:95a7476c59002f2f6c590b9b7b998306fba6a5aa646b1e22ddfeaf8f78c3a29c \ + --hash=sha256:96ff0b2ad353d8f990b63294c8986f1ec3cb19d749234014f4e7eb0112ceba5a \ + --hash=sha256:9fad7dcb1aac3c7f0584a5a8133e3a43eeb2fe127f47e3632d43d677c66c102b \ + --hash=sha256:9ff0f4f29c51e2803569d7a51c2304de5554655a60c5d776e35b4a41413830d0 \ + --hash=sha256:a354325ee03388678242a4d7ebcd08b5c727033fcff3b2f536aea978e15ee9e6 \ + --hash=sha256:a4abb4f9001ad2858e7ac189089c42178fcce737e4169dc61321660f1a96c7d2 \ + --hash=sha256:ab47dbe5cc8210f55aa58e4805fe224dac469cde56b9f731a4c098b91917159a \ + --hash=sha256:afedb719a9dcfc7eaf2287b839d8198e06dcd4cb5d276a3df279231138e83d30 \ + --hash=sha256:b3ce300f3644fb06443ee2222c2201dd3a89ea6040541412b8fa189341847218 \ + --hash=sha256:b97fe8060236edf3662adfc2c633f56a08ae30560c56310562cb4f95500022d5 \ + --hash=sha256:bfe25acf8b437eb2a8b2d49d443800a5f18508cd811fea3181723922a8a82b07 \ + --hash=sha256:cd25bcecc4974d09257ffcd1f098ee778f7834c3ad767fe5db785be9a4aa9cb2 \ + --hash=sha256:d209d8969599b27ad20994c8e41936ee0964e6da07478d6c35016bc386b66ad4 \ + --hash=sha256:d5241e0a80d808d70546c697135da2c613f30e28251ff8307eb72ba696945764 \ + --hash=sha256:edd8b5fe47dab091176d21bb6de568acdd906d1887a4584a15a9a96a1dca06ef \ + --hash=sha256:f870204a840a60da0b12273ef34f7051e98c3b5961b61b0c2c1be6dfd64fbcd3 \ + --hash=sha256:ffa75af20b44f8dba823498024771d5ac50620e6915abac414251bd971b4529f # via # -r requirements.in # h5py @@ -411,10 +408,6 @@ numpy==1.26.1 ; python_version >= 
"3.12" \ # opt-einsum # scipy # tb-nightly -oauthlib==3.2.2 \ - --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ - --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 - # via requests-oauthlib opt-einsum==3.3.0 \ --hash=sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147 \ --hash=sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549 @@ -444,57 +437,36 @@ protobuf==4.23.4 \ --hash=sha256:effeac51ab79332d44fba74660d40ae79985901ac21bca408f8dc335a81aa597 \ --hash=sha256:fee88269a090ada09ca63551bf2f573eb2424035bcf2cb1b121895b01a46594a # via tb-nightly -psutil==5.9.6 \ - --hash=sha256:10e8c17b4f898d64b121149afb136c53ea8b68c7531155147867b7b1ac9e7e28 \ - --hash=sha256:18cd22c5db486f33998f37e2bb054cc62fd06646995285e02a51b1e08da97017 \ - --hash=sha256:3ebf2158c16cc69db777e3c7decb3c0f43a7af94a60d72e87b2823aebac3d602 \ - --hash=sha256:51dc3d54607c73148f63732c727856f5febec1c7c336f8f41fcbd6315cce76ac \ - --hash=sha256:6e5fb8dc711a514da83098bc5234264e551ad980cec5f85dabf4d38ed6f15e9a \ - --hash=sha256:70cb3beb98bc3fd5ac9ac617a327af7e7f826373ee64c80efd4eb2856e5051e9 \ - --hash=sha256:748c9dd2583ed86347ed65d0035f45fa8c851e8d90354c122ab72319b5f366f4 \ - --hash=sha256:91ecd2d9c00db9817a4b4192107cf6954addb5d9d67a969a4f436dbc9200f88c \ - --hash=sha256:92e0cc43c524834af53e9d3369245e6cc3b130e78e26100d1f63cdb0abeb3d3c \ - --hash=sha256:a6f01f03bf1843280f4ad16f4bde26b817847b4c1a0db59bf6419807bc5ce05c \ - --hash=sha256:c69596f9fc2f8acd574a12d5f8b7b1ba3765a641ea5d60fb4736bf3c08a8214a \ - --hash=sha256:ca2780f5e038379e520281e4c032dddd086906ddff9ef0d1b9dcf00710e5071c \ - --hash=sha256:daecbcbd29b289aac14ece28eca6a3e60aa361754cf6da3dfb20d4d32b6c7f57 \ - --hash=sha256:e4b92ddcd7dd4cdd3f900180ea1e104932c7bce234fb88976e2a3b296441225a \ - --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ - --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa 
+psutil==5.9.8 \ + --hash=sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d \ + --hash=sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73 \ + --hash=sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8 \ + --hash=sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2 \ + --hash=sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e \ + --hash=sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36 \ + --hash=sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7 \ + --hash=sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c \ + --hash=sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee \ + --hash=sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421 \ + --hash=sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf \ + --hash=sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81 \ + --hash=sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0 \ + --hash=sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631 \ + --hash=sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4 \ + --hash=sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8 # via portpicker -pyasn1==0.5.0 \ - --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ - --hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.3.0 \ - --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ - --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d - # via google-auth -pygments==2.16.1 \ - --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \ - --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29 +pygments==2.17.2 \ + 
--hash=sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c \ + --hash=sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367 # via rich requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 - # via - # -r requirements.in - # requests-oauthlib - # tb-nightly -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a - # via google-auth-oauthlib -rich==13.6.0 \ - --hash=sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245 \ - --hash=sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef + # via -r requirements.in +rich==13.7.0 \ + --hash=sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa \ + --hash=sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235 # via keras-nightly -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth scipy==1.11.3 \ --hash=sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3 \ --hash=sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929 \ @@ -530,8 +502,8 @@ six==1.16.0 \ # via # astunparse # tb-nightly -tb-nightly==2.15.0a20231023 \ - --hash=sha256:8990a52985e3296aa18a6825efc017bcded9e2fb2cbcdd2d01f5c62d4fdc9825 +tb-nightly==2.17.0a20240214 \ + --hash=sha256:dbd59bcfd9b028e6199050e36cbb4ee1db4a94e69a7f8c00865f1498112a2b83 # via -r requirements.in tblib==2.0.0 \ --hash=sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a \ @@ -546,13 +518,17 @@ termcolor==2.3.0 \ --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ 
--hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a # via -r requirements.in +tf-keras-nightly==2.16.0.dev2024021410 \ + --hash=sha256:05a4c19c6a795ec9ed6f4dac69d443601b9206b910524d8b19bad6e362d49f47 \ + --hash=sha256:83d5a1dd3979b42164a2fac4bdd2ed5981ef0c2a0514ca64e4bc67e369caef5c + # via tb-nightly typing-extensions==4.8.0 \ --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef # via -r requirements.in -urllib3==2.0.7 \ - --hash=sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84 \ - --hash=sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e +urllib3==2.2.0 \ + --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \ + --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224 # via requests werkzeug==3.0.1 \ --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \ @@ -564,81 +540,77 @@ wheel==0.41.3 \ # via # -r requirements.in # astunparse -wrapt==1.14.1 \ - --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ - --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ - --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ - --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ - --hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ - --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ - --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ - --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ - --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ - --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ - 
--hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ - --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ - --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ - --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ - --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ - --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ - --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ - --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ - --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ - --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ - --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ - --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ - --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ - --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ - --hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ - --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ - --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ - --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ - --hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ - --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ - --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ - --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ - --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ - --hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ - 
--hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ - --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ - --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ - --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ - --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ - --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ - --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ - --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ - --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ - --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ - --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ - --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ - --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ - --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ - --hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ - --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ - --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ - --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ - --hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ - --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ - --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ - --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ - --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ - --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ - 
--hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ - --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ - --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ - --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ - --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ - --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ - --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ - --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ - --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ - --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ - --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ - --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ - --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ - --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ - --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ - --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + 
--hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + 
--hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + 
--hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 # via -r requirements.in setuptools==68.2.2 \ diff --git a/requirements_lock_3_9.txt b/requirements_lock_3_9.txt index 0fb35480a8f886..9d9e85aceda9c7 100644 --- a/requirements_lock_3_9.txt +++ b/requirements_lock_3_9.txt @@ -1,6 +1,6 @@ -absl-py==2.0.0 \ - --hash=sha256:9a28abb62774ae4e8edbe2dd4c49ffcd45a6a848952a5eccc6a49f3f0fc1e2f3 \ - --hash=sha256:d9690211c5fcfefcdd1a45470ac2b5c5acd45241c3af71eed96bc5441746c0d5 +absl-py==2.1.0 \ + --hash=sha256:526a04eadab8b4ee719ce68f204172ead1027549089702d99b9059f129ff1308 \ + --hash=sha256:7820790efbb316739cde8b4e19357243fc3608a152024288513dd968d7d959ff # via # keras-nightly # tb-nightly @@ -12,105 +12,101 @@ astunparse==1.6.3 \ --hash=sha256:5ad93a8456f0d084c3456d059fd9a92cce667963232cbf763eac3bc5b7940872 \ 
--hash=sha256:c2652417f2c8b5bb325c885ae329bdf3f86424075c4fd1a128674bc6fba4b8e8 # via -r requirements.in -cachetools==5.3.2 \ - --hash=sha256:086ee420196f7b2ab9ca2db2520aca326318b68fe5ba8bc4d49cca91add450f2 \ - --hash=sha256:861f35a13a451f94e301ce2bec7cac63e881232ccce7ed67fab9b5df4d3beaa1 - # via google-auth -certifi==2023.7.22 \ - --hash=sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082 \ - --hash=sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9 +certifi==2024.2.2 \ + --hash=sha256:0569859f95fc761b18b45ef421b1290a0f65f147e92a1e5eb3e635f9a5e4e66f \ + --hash=sha256:dc383c07b76109f368f6106eee2b593b04a011ea4d55f652c6ca24a754d1cdd1 # via requests -charset-normalizer==3.3.1 \ - --hash=sha256:06cf46bdff72f58645434d467bf5228080801298fbba19fe268a01b4534467f5 \ - --hash=sha256:0c8c61fb505c7dad1d251c284e712d4e0372cef3b067f7ddf82a7fa82e1e9a93 \ - --hash=sha256:10b8dd31e10f32410751b3430996f9807fc4d1587ca69772e2aa940a82ab571a \ - --hash=sha256:1171ef1fc5ab4693c5d151ae0fdad7f7349920eabbaca6271f95969fa0756c2d \ - --hash=sha256:17a866d61259c7de1bdadef418a37755050ddb4b922df8b356503234fff7932c \ - --hash=sha256:1d6bfc32a68bc0933819cfdfe45f9abc3cae3877e1d90aac7259d57e6e0f85b1 \ - --hash=sha256:1ec937546cad86d0dce5396748bf392bb7b62a9eeb8c66efac60e947697f0e58 \ - --hash=sha256:223b4d54561c01048f657fa6ce41461d5ad8ff128b9678cfe8b2ecd951e3f8a2 \ - --hash=sha256:2465aa50c9299d615d757c1c888bc6fef384b7c4aec81c05a0172b4400f98557 \ - --hash=sha256:28f512b9a33235545fbbdac6a330a510b63be278a50071a336afc1b78781b147 \ - --hash=sha256:2c092be3885a1b7899cd85ce24acedc1034199d6fca1483fa2c3a35c86e43041 \ - --hash=sha256:2c4c99f98fc3a1835af8179dcc9013f93594d0670e2fa80c83aa36346ee763d2 \ - --hash=sha256:31445f38053476a0c4e6d12b047b08ced81e2c7c712e5a1ad97bc913256f91b2 \ - --hash=sha256:31bbaba7218904d2eabecf4feec0d07469284e952a27400f23b6628439439fa7 \ - --hash=sha256:34d95638ff3613849f473afc33f65c401a89f3b9528d0d213c7037c398a51296 \ - 
--hash=sha256:352a88c3df0d1fa886562384b86f9a9e27563d4704ee0e9d56ec6fcd270ea690 \ - --hash=sha256:39b70a6f88eebe239fa775190796d55a33cfb6d36b9ffdd37843f7c4c1b5dc67 \ - --hash=sha256:3c66df3f41abee950d6638adc7eac4730a306b022570f71dd0bd6ba53503ab57 \ - --hash=sha256:3f70fd716855cd3b855316b226a1ac8bdb3caf4f7ea96edcccc6f484217c9597 \ - --hash=sha256:3f9bc2ce123637a60ebe819f9fccc614da1bcc05798bbbaf2dd4ec91f3e08846 \ - --hash=sha256:3fb765362688821404ad6cf86772fc54993ec11577cd5a92ac44b4c2ba52155b \ - --hash=sha256:45f053a0ece92c734d874861ffe6e3cc92150e32136dd59ab1fb070575189c97 \ - --hash=sha256:46fb9970aa5eeca547d7aa0de5d4b124a288b42eaefac677bde805013c95725c \ - --hash=sha256:4cb50a0335382aac15c31b61d8531bc9bb657cfd848b1d7158009472189f3d62 \ - --hash=sha256:4e12f8ee80aa35e746230a2af83e81bd6b52daa92a8afaef4fea4a2ce9b9f4fa \ - --hash=sha256:4f3100d86dcd03c03f7e9c3fdb23d92e32abbca07e7c13ebd7ddfbcb06f5991f \ - --hash=sha256:4f6e2a839f83a6a76854d12dbebde50e4b1afa63e27761549d006fa53e9aa80e \ - --hash=sha256:4f861d94c2a450b974b86093c6c027888627b8082f1299dfd5a4bae8e2292821 \ - --hash=sha256:501adc5eb6cd5f40a6f77fbd90e5ab915c8fd6e8c614af2db5561e16c600d6f3 \ - --hash=sha256:520b7a142d2524f999447b3a0cf95115df81c4f33003c51a6ab637cbda9d0bf4 \ - --hash=sha256:548eefad783ed787b38cb6f9a574bd8664468cc76d1538215d510a3cd41406cb \ - --hash=sha256:555fe186da0068d3354cdf4bbcbc609b0ecae4d04c921cc13e209eece7720727 \ - --hash=sha256:55602981b2dbf8184c098bc10287e8c245e351cd4fdcad050bd7199d5a8bf514 \ - --hash=sha256:58e875eb7016fd014c0eea46c6fa92b87b62c0cb31b9feae25cbbe62c919f54d \ - --hash=sha256:5a3580a4fdc4ac05f9e53c57f965e3594b2f99796231380adb2baaab96e22761 \ - --hash=sha256:5b70bab78accbc672f50e878a5b73ca692f45f5b5e25c8066d748c09405e6a55 \ - --hash=sha256:5ceca5876032362ae73b83347be8b5dbd2d1faf3358deb38c9c88776779b2e2f \ - --hash=sha256:61f1e3fb621f5420523abb71f5771a204b33c21d31e7d9d86881b2cffe92c47c \ - --hash=sha256:633968254f8d421e70f91c6ebe71ed0ab140220469cf87a9857e21c16687c034 \ - 
--hash=sha256:63a6f59e2d01310f754c270e4a257426fe5a591dc487f1983b3bbe793cf6bac6 \ - --hash=sha256:63accd11149c0f9a99e3bc095bbdb5a464862d77a7e309ad5938fbc8721235ae \ - --hash=sha256:6db3cfb9b4fcecb4390db154e75b49578c87a3b9979b40cdf90d7e4b945656e1 \ - --hash=sha256:71ef3b9be10070360f289aea4838c784f8b851be3ba58cf796262b57775c2f14 \ - --hash=sha256:7ae8e5142dcc7a49168f4055255dbcced01dc1714a90a21f87448dc8d90617d1 \ - --hash=sha256:7b6cefa579e1237ce198619b76eaa148b71894fb0d6bcf9024460f9bf30fd228 \ - --hash=sha256:800561453acdecedaac137bf09cd719c7a440b6800ec182f077bb8e7025fb708 \ - --hash=sha256:82ca51ff0fc5b641a2d4e1cc8c5ff108699b7a56d7f3ad6f6da9dbb6f0145b48 \ - --hash=sha256:851cf693fb3aaef71031237cd68699dded198657ec1e76a76eb8be58c03a5d1f \ - --hash=sha256:854cc74367180beb327ab9d00f964f6d91da06450b0855cbbb09187bcdb02de5 \ - --hash=sha256:87071618d3d8ec8b186d53cb6e66955ef2a0e4fa63ccd3709c0c90ac5a43520f \ - --hash=sha256:871d045d6ccc181fd863a3cd66ee8e395523ebfbc57f85f91f035f50cee8e3d4 \ - --hash=sha256:8aee051c89e13565c6bd366813c386939f8e928af93c29fda4af86d25b73d8f8 \ - --hash=sha256:8af5a8917b8af42295e86b64903156b4f110a30dca5f3b5aedea123fbd638bff \ - --hash=sha256:8ec8ef42c6cd5856a7613dcd1eaf21e5573b2185263d87d27c8edcae33b62a61 \ - --hash=sha256:91e43805ccafa0a91831f9cd5443aa34528c0c3f2cc48c4cb3d9a7721053874b \ - --hash=sha256:9505dc359edb6a330efcd2be825fdb73ee3e628d9010597aa1aee5aa63442e97 \ - --hash=sha256:985c7965f62f6f32bf432e2681173db41336a9c2611693247069288bcb0c7f8b \ - --hash=sha256:9a74041ba0bfa9bc9b9bb2cd3238a6ab3b7618e759b41bd15b5f6ad958d17605 \ - --hash=sha256:9edbe6a5bf8b56a4a84533ba2b2f489d0046e755c29616ef8830f9e7d9cf5728 \ - --hash=sha256:a15c1fe6d26e83fd2e5972425a772cca158eae58b05d4a25a4e474c221053e2d \ - --hash=sha256:a66bcdf19c1a523e41b8e9d53d0cedbfbac2e93c649a2e9502cb26c014d0980c \ - --hash=sha256:ae4070f741f8d809075ef697877fd350ecf0b7c5837ed68738607ee0a2c572cf \ - --hash=sha256:ae55d592b02c4349525b6ed8f74c692509e5adffa842e582c0f861751701a673 \ - 
--hash=sha256:b578cbe580e3b41ad17b1c428f382c814b32a6ce90f2d8e39e2e635d49e498d1 \ - --hash=sha256:b891a2f68e09c5ef989007fac11476ed33c5c9994449a4e2c3386529d703dc8b \ - --hash=sha256:baec8148d6b8bd5cee1ae138ba658c71f5b03e0d69d5907703e3e1df96db5e41 \ - --hash=sha256:bb06098d019766ca16fc915ecaa455c1f1cd594204e7f840cd6258237b5079a8 \ - --hash=sha256:bc791ec3fd0c4309a753f95bb6c749ef0d8ea3aea91f07ee1cf06b7b02118f2f \ - --hash=sha256:bd28b31730f0e982ace8663d108e01199098432a30a4c410d06fe08fdb9e93f4 \ - --hash=sha256:be4d9c2770044a59715eb57c1144dedea7c5d5ae80c68fb9959515037cde2008 \ - --hash=sha256:c0c72d34e7de5604df0fde3644cc079feee5e55464967d10b24b1de268deceb9 \ - --hash=sha256:c0e842112fe3f1a4ffcf64b06dc4c61a88441c2f02f373367f7b4c1aa9be2ad5 \ - --hash=sha256:c15070ebf11b8b7fd1bfff7217e9324963c82dbdf6182ff7050519e350e7ad9f \ - --hash=sha256:c2000c54c395d9e5e44c99dc7c20a64dc371f777faf8bae4919ad3e99ce5253e \ - --hash=sha256:c30187840d36d0ba2893bc3271a36a517a717f9fd383a98e2697ee890a37c273 \ - --hash=sha256:cb7cd68814308aade9d0c93c5bd2ade9f9441666f8ba5aa9c2d4b389cb5e2a45 \ - --hash=sha256:cd805513198304026bd379d1d516afbf6c3c13f4382134a2c526b8b854da1c2e \ - --hash=sha256:d0bf89afcbcf4d1bb2652f6580e5e55a840fdf87384f6063c4a4f0c95e378656 \ - --hash=sha256:d9137a876020661972ca6eec0766d81aef8a5627df628b664b234b73396e727e \ - --hash=sha256:dbd95e300367aa0827496fe75a1766d198d34385a58f97683fe6e07f89ca3e3c \ - --hash=sha256:dced27917823df984fe0c80a5c4ad75cf58df0fbfae890bc08004cd3888922a2 \ - --hash=sha256:de0b4caa1c8a21394e8ce971997614a17648f94e1cd0640fbd6b4d14cab13a72 \ - --hash=sha256:debb633f3f7856f95ad957d9b9c781f8e2c6303ef21724ec94bea2ce2fcbd056 \ - --hash=sha256:e372d7dfd154009142631de2d316adad3cc1c36c32a38b16a4751ba78da2a397 \ - --hash=sha256:ecd26be9f112c4f96718290c10f4caea6cc798459a3a76636b817a0ed7874e42 \ - --hash=sha256:edc0202099ea1d82844316604e17d2b175044f9bcb6b398aab781eba957224bd \ - --hash=sha256:f194cce575e59ffe442c10a360182a986535fd90b57f7debfaa5c845c409ecc3 \ - 
--hash=sha256:f5fb672c396d826ca16a022ac04c9dce74e00a1c344f6ad1a0fdc1ba1f332213 \ - --hash=sha256:f6a02a3c7950cafaadcd46a226ad9e12fc9744652cc69f9e5534f98b47f3bbcf \ - --hash=sha256:fe81b35c33772e56f4b6cf62cf4aedc1762ef7162a31e6ac7fe5e40d0149eb67 +charset-normalizer==3.3.2 \ + --hash=sha256:06435b539f889b1f6f4ac1758871aae42dc3a8c0e24ac9e60c2384973ad73027 \ + --hash=sha256:06a81e93cd441c56a9b65d8e1d043daeb97a3d0856d177d5c90ba85acb3db087 \ + --hash=sha256:0a55554a2fa0d408816b3b5cedf0045f4b8e1a6065aec45849de2d6f3f8e9786 \ + --hash=sha256:0b2b64d2bb6d3fb9112bafa732def486049e63de9618b5843bcdd081d8144cd8 \ + --hash=sha256:10955842570876604d404661fbccbc9c7e684caf432c09c715ec38fbae45ae09 \ + --hash=sha256:122c7fa62b130ed55f8f285bfd56d5f4b4a5b503609d181f9ad85e55c89f4185 \ + --hash=sha256:1ceae2f17a9c33cb48e3263960dc5fc8005351ee19db217e9b1bb15d28c02574 \ + --hash=sha256:1d3193f4a680c64b4b6a9115943538edb896edc190f0b222e73761716519268e \ + --hash=sha256:1f79682fbe303db92bc2b1136016a38a42e835d932bab5b3b1bfcfbf0640e519 \ + --hash=sha256:2127566c664442652f024c837091890cb1942c30937add288223dc895793f898 \ + --hash=sha256:22afcb9f253dac0696b5a4be4a1c0f8762f8239e21b99680099abd9b2b1b2269 \ + --hash=sha256:25baf083bf6f6b341f4121c2f3c548875ee6f5339300e08be3f2b2ba1721cdd3 \ + --hash=sha256:2e81c7b9c8979ce92ed306c249d46894776a909505d8f5a4ba55b14206e3222f \ + --hash=sha256:3287761bc4ee9e33561a7e058c72ac0938c4f57fe49a09eae428fd88aafe7bb6 \ + --hash=sha256:34d1c8da1e78d2e001f363791c98a272bb734000fcef47a491c1e3b0505657a8 \ + --hash=sha256:37e55c8e51c236f95b033f6fb391d7d7970ba5fe7ff453dad675e88cf303377a \ + --hash=sha256:3d47fa203a7bd9c5b6cee4736ee84ca03b8ef23193c0d1ca99b5089f72645c73 \ + --hash=sha256:3e4d1f6587322d2788836a99c69062fbb091331ec940e02d12d179c1d53e25fc \ + --hash=sha256:42cb296636fcc8b0644486d15c12376cb9fa75443e00fb25de0b8602e64c1714 \ + --hash=sha256:45485e01ff4d3630ec0d9617310448a8702f70e9c01906b0d0118bdf9d124cf2 \ + 
--hash=sha256:4a78b2b446bd7c934f5dcedc588903fb2f5eec172f3d29e52a9096a43722adfc \ + --hash=sha256:4ab2fe47fae9e0f9dee8c04187ce5d09f48eabe611be8259444906793ab7cbce \ + --hash=sha256:4d0d1650369165a14e14e1e47b372cfcb31d6ab44e6e33cb2d4e57265290044d \ + --hash=sha256:549a3a73da901d5bc3ce8d24e0600d1fa85524c10287f6004fbab87672bf3e1e \ + --hash=sha256:55086ee1064215781fff39a1af09518bc9255b50d6333f2e4c74ca09fac6a8f6 \ + --hash=sha256:572c3763a264ba47b3cf708a44ce965d98555f618ca42c926a9c1616d8f34269 \ + --hash=sha256:573f6eac48f4769d667c4442081b1794f52919e7edada77495aaed9236d13a96 \ + --hash=sha256:5b4c145409bef602a690e7cfad0a15a55c13320ff7a3ad7ca59c13bb8ba4d45d \ + --hash=sha256:6463effa3186ea09411d50efc7d85360b38d5f09b870c48e4600f63af490e56a \ + --hash=sha256:65f6f63034100ead094b8744b3b97965785388f308a64cf8d7c34f2f2e5be0c4 \ + --hash=sha256:663946639d296df6a2bb2aa51b60a2454ca1cb29835324c640dafb5ff2131a77 \ + --hash=sha256:6897af51655e3691ff853668779c7bad41579facacf5fd7253b0133308cf000d \ + --hash=sha256:68d1f8a9e9e37c1223b656399be5d6b448dea850bed7d0f87a8311f1ff3dabb0 \ + --hash=sha256:6ac7ffc7ad6d040517be39eb591cac5ff87416c2537df6ba3cba3bae290c0fed \ + --hash=sha256:6b3251890fff30ee142c44144871185dbe13b11bab478a88887a639655be1068 \ + --hash=sha256:6c4caeef8fa63d06bd437cd4bdcf3ffefe6738fb1b25951440d80dc7df8c03ac \ + --hash=sha256:6ef1d82a3af9d3eecdba2321dc1b3c238245d890843e040e41e470ffa64c3e25 \ + --hash=sha256:753f10e867343b4511128c6ed8c82f7bec3bd026875576dfd88483c5c73b2fd8 \ + --hash=sha256:7cd13a2e3ddeed6913a65e66e94b51d80a041145a026c27e6bb76c31a853c6ab \ + --hash=sha256:7ed9e526742851e8d5cc9e6cf41427dfc6068d4f5a3bb03659444b4cabf6bc26 \ + --hash=sha256:7f04c839ed0b6b98b1a7501a002144b76c18fb1c1850c8b98d458ac269e26ed2 \ + --hash=sha256:802fe99cca7457642125a8a88a084cef28ff0cf9407060f7b93dca5aa25480db \ + --hash=sha256:80402cd6ee291dcb72644d6eac93785fe2c8b9cb30893c1af5b8fdd753b9d40f \ + --hash=sha256:8465322196c8b4d7ab6d1e049e4c5cb460d0394da4a27d23cc242fbf0034b6b5 \ + 
--hash=sha256:86216b5cee4b06df986d214f664305142d9c76df9b6512be2738aa72a2048f99 \ + --hash=sha256:87d1351268731db79e0f8e745d92493ee2841c974128ef629dc518b937d9194c \ + --hash=sha256:8bdb58ff7ba23002a4c5808d608e4e6c687175724f54a5dade5fa8c67b604e4d \ + --hash=sha256:8c622a5fe39a48f78944a87d4fb8a53ee07344641b0562c540d840748571b811 \ + --hash=sha256:8d756e44e94489e49571086ef83b2bb8ce311e730092d2c34ca8f7d925cb20aa \ + --hash=sha256:8f4a014bc36d3c57402e2977dada34f9c12300af536839dc38c0beab8878f38a \ + --hash=sha256:9063e24fdb1e498ab71cb7419e24622516c4a04476b17a2dab57e8baa30d6e03 \ + --hash=sha256:90d558489962fd4918143277a773316e56c72da56ec7aa3dc3dbbe20fdfed15b \ + --hash=sha256:923c0c831b7cfcb071580d3f46c4baf50f174be571576556269530f4bbd79d04 \ + --hash=sha256:95f2a5796329323b8f0512e09dbb7a1860c46a39da62ecb2324f116fa8fdc85c \ + --hash=sha256:96b02a3dc4381e5494fad39be677abcb5e6634bf7b4fa83a6dd3112607547001 \ + --hash=sha256:9f96df6923e21816da7e0ad3fd47dd8f94b2a5ce594e00677c0013018b813458 \ + --hash=sha256:a10af20b82360ab00827f916a6058451b723b4e65030c5a18577c8b2de5b3389 \ + --hash=sha256:a50aebfa173e157099939b17f18600f72f84eed3049e743b68ad15bd69b6bf99 \ + --hash=sha256:a981a536974bbc7a512cf44ed14938cf01030a99e9b3a06dd59578882f06f985 \ + --hash=sha256:a9a8e9031d613fd2009c182b69c7b2c1ef8239a0efb1df3f7c8da66d5dd3d537 \ + --hash=sha256:ae5f4161f18c61806f411a13b0310bea87f987c7d2ecdbdaad0e94eb2e404238 \ + --hash=sha256:aed38f6e4fb3f5d6bf81bfa990a07806be9d83cf7bacef998ab1a9bd660a581f \ + --hash=sha256:b01b88d45a6fcb69667cd6d2f7a9aeb4bf53760d7fc536bf679ec94fe9f3ff3d \ + --hash=sha256:b261ccdec7821281dade748d088bb6e9b69e6d15b30652b74cbbac25e280b796 \ + --hash=sha256:b2b0a0c0517616b6869869f8c581d4eb2dd83a4d79e0ebcb7d373ef9956aeb0a \ + --hash=sha256:b4a23f61ce87adf89be746c8a8974fe1c823c891d8f86eb218bb957c924bb143 \ + --hash=sha256:bd8f7df7d12c2db9fab40bdd87a7c09b1530128315d047a086fa3ae3435cb3a8 \ + --hash=sha256:beb58fe5cdb101e3a055192ac291b7a21e3b7ef4f67fa1d74e331a7f2124341c \ + 
--hash=sha256:c002b4ffc0be611f0d9da932eb0f704fe2602a9a949d1f738e4c34c75b0863d5 \ + --hash=sha256:c083af607d2515612056a31f0a8d9e0fcb5876b7bfc0abad3ecd275bc4ebc2d5 \ + --hash=sha256:c180f51afb394e165eafe4ac2936a14bee3eb10debc9d9e4db8958fe36afe711 \ + --hash=sha256:c235ebd9baae02f1b77bcea61bce332cb4331dc3617d254df3323aa01ab47bd4 \ + --hash=sha256:cd70574b12bb8a4d2aaa0094515df2463cb429d8536cfb6c7ce983246983e5a6 \ + --hash=sha256:d0eccceffcb53201b5bfebb52600a5fb483a20b61da9dbc885f8b103cbe7598c \ + --hash=sha256:d965bba47ddeec8cd560687584e88cf699fd28f192ceb452d1d7ee807c5597b7 \ + --hash=sha256:db364eca23f876da6f9e16c9da0df51aa4f104a972735574842618b8c6d999d4 \ + --hash=sha256:ddbb2551d7e0102e7252db79ba445cdab71b26640817ab1e3e3648dad515003b \ + --hash=sha256:deb6be0ac38ece9ba87dea880e438f25ca3eddfac8b002a2ec3d9183a454e8ae \ + --hash=sha256:e06ed3eb3218bc64786f7db41917d4e686cc4856944f53d5bdf83a6884432e12 \ + --hash=sha256:e27ad930a842b4c5eb8ac0016b0a54f5aebbe679340c26101df33424142c143c \ + --hash=sha256:e537484df0d8f426ce2afb2d0f8e1c3d0b114b83f8850e5f2fbea0e797bd82ae \ + --hash=sha256:eb00ed941194665c332bf8e078baf037d6c35d7c4f3102ea2d4f16ca94a26dc8 \ + --hash=sha256:eb6904c354526e758fda7167b33005998fb68c46fbc10e013ca97f21ca5c8887 \ + --hash=sha256:eb8821e09e916165e160797a6c17edda0679379a4be5c716c260e836e122f54b \ + --hash=sha256:efcb3f6676480691518c177e3b465bcddf57cea040302f9f4e6e191af91174d4 \ + --hash=sha256:f27273b60488abe721a075bcca6d7f3964f9f6f067c8c4c605743023d7d3944f \ + --hash=sha256:f30c3cb33b24454a82faecaf01b19c18562b1e89558fb6c56de4d9118a032fd5 \ + --hash=sha256:fb69256e180cb6c8a894fee62b3afebae785babc1ee98b81cdf68bbca1987f33 \ + --hash=sha256:fd1abc0d89e30cc4e02e4064dc67fcc51bd941eb395c502aac3ec19fab46b519 \ + --hash=sha256:ff8fa367d09b717b2a17a052544193ad76cd49979c805768879cb63d9ca50561 # via requests dill==0.3.7 \ --hash=sha256:76b122c08ef4ce2eedcd4d1abd8e641114bfc6c2867f49f3c41facf65bf19f5e \ @@ -133,6 +129,7 @@ dm-tree==0.1.8 \ 
--hash=sha256:35cc164a79336bfcfafb47e5f297898359123bbd3330c1967f0c4994f9cf9f60 \ --hash=sha256:378cc8ad93c5fe3590f405a309980721f021c790ca1bdf9b15bb1d59daec57f5 \ --hash=sha256:39070ba268c0491af9fe7a58644d99e8b4f2cde6e5884ba3380bddc84ed43d5f \ + --hash=sha256:435227cf3c5dc63f4de054cf3d00183790bd9ead4c3623138c74dde7f67f521b \ --hash=sha256:5483dca4d7eb1a0d65fe86d3b6a53ae717face83c1f17e0887b1a4a64ae5c410 \ --hash=sha256:694c3654cfd2a81552c08ec66bb5c4a3d48fa292b9a181880fb081c36c5b9134 \ --hash=sha256:803bfc53b4659f447ac694dbd04235f94a73ef7c1fd1e0df7c84ac41e0bc963b \ @@ -140,6 +137,8 @@ dm-tree==0.1.8 \ --hash=sha256:83b7764de0d855338abefc6e3ee9fe40d301668310aa3baea3f778ff051f4393 \ --hash=sha256:8c60a7eadab64c2278861f56bca320b2720f163dca9d7558103c3b77f2416571 \ --hash=sha256:8ed3564abed97c806db122c2d3e1a2b64c74a63debe9903aad795167cc301368 \ + --hash=sha256:94d3f0826311f45ee19b75f5b48c99466e4218a0489e81c0f0167bda50cacf22 \ + --hash=sha256:96a548a406a6fb15fe58f6a30a57ff2f2aafbf25f05afab00c8f5e5977b6c715 \ --hash=sha256:a5d819c38c03f0bb5b3b3703c60e4b170355a0fc6b5819325bf3d4ceb3ae7e80 \ --hash=sha256:ad16ceba90a56ec47cf45b21856d14962ac314787975ef786efb5e6e9ca75ec7 \ --hash=sha256:af4b3d372f2477dcd89a6e717e4a575ca35ccc20cc4454a8a4b6f8838a00672d \ @@ -147,6 +146,7 @@ dm-tree==0.1.8 \ --hash=sha256:b9bd9b9ccb59409d33d51d84b7668010c04c2af7d4a371632874c1ca356cff3d \ --hash=sha256:b9f89a454e98806b44fe9d40ec9eee61f848388f7e79ac2371a55679bd5a3ac6 \ --hash=sha256:bb2d109f42190225112da899b9f3d46d0d5f26aef501c61e43529fe9322530b5 \ + --hash=sha256:c0a94aba18a35457a1b5cd716fd7b46c5dafdc4cf7869b4bae665b91c4682a8e \ --hash=sha256:c5c8c12e3fda754ef6af94161bacdaeda816d941995fac415d6855c6c386af68 \ --hash=sha256:d1612fcaecd79023dbc6a6ae48d51a80beb5c385d6f3f6d71688e57bc8d07de8 \ --hash=sha256:d16e1f2a073604cfcc09f7131ae8d534674f43c3aef4c25742eae295bc60d04f \ @@ -154,6 +154,7 @@ dm-tree==0.1.8 \ --hash=sha256:d40fa4106ca6edc66760246a08f500ec0c85ef55c762fb4a363f6ee739ba02ee \ 
--hash=sha256:de287fabc464b8734be251e46e06aa9aa1001f34198da2b6ce07bd197172b9cb \ --hash=sha256:e4d714371bb08839e4e5e29024fc95832d9affe129825ef38836b143028bd144 \ + --hash=sha256:ea9e59e0451e7d29aece402d9f908f2e2a80922bcde2ebfd5dcb07750fcbfee8 \ --hash=sha256:f7ac31b9aecccb2c6e1ab29706f6ded3eba0c2c69c770322c9c685929c3d6afb \ --hash=sha256:fa42a605d099ee7d41ba2b5fb75e21423951fd26e5d50583a00471238fb3021d # via keras-nightly @@ -161,71 +162,61 @@ gast==0.4.0 \ --hash=sha256:40feb7b8b8434785585ab224d1568b857edb18297e5a3047f1ba012bc83b42c1 \ --hash=sha256:b7adcdd5adbebf1adf17378da5ba3f543684dbec47b1cda1f3997e573cd542c4 # via -r requirements.in -google-auth==2.23.3 \ - --hash=sha256:6864247895eea5d13b9c57c9e03abb49cb94ce2dc7c58e91cba3248c7477c9e3 \ - --hash=sha256:a8f4608e65c244ead9e0538f181a96c6e11199ec114d41f1d7b1bffa96937bda - # via - # google-auth-oauthlib - # tb-nightly -google-auth-oauthlib==1.1.0 \ - --hash=sha256:089c6e587d36f4803ac7e0720c045c6a8b1fd1790088b8424975b90d0ee61c12 \ - --hash=sha256:83ea8c3b0881e453790baff4448e8a6112ac8778d1de9da0b68010b843937afb - # via tb-nightly -grpcio==1.59.2 \ - --hash=sha256:023088764012411affe7db183d1ada3ad9daf2e23ddc719ff46d7061de661340 \ - --hash=sha256:08d77e682f2bf730a4961eea330e56d2f423c6a9b91ca222e5b1eb24a357b19f \ - --hash=sha256:0a4a3833c0e067f3558538727235cd8a49709bff1003200bbdefa2f09334e4b1 \ - --hash=sha256:0a754aff9e3af63bdc4c75c234b86b9d14e14a28a30c4e324aed1a9b873d755f \ - --hash=sha256:11168ef43e4a43ff1b1a65859f3e0ef1a173e277349e7fb16923ff108160a8cd \ - --hash=sha256:128e20f57c5f27cb0157e73756d1586b83c1b513ebecc83ea0ac37e4b0e4e758 \ - --hash=sha256:1f9524d1d701e399462d2c90ba7c193e49d1711cf429c0d3d97c966856e03d00 \ - --hash=sha256:1ff16d68bf453275466a9a46739061a63584d92f18a0f5b33d19fc97eb69867c \ - --hash=sha256:2067274c88bc6de89c278a672a652b4247d088811ece781a4858b09bdf8448e3 \ - --hash=sha256:2171c39f355ba5b551c5d5928d65aa6c69807fae195b86ef4a7d125bcdb860a9 \ - 
--hash=sha256:242adc47725b9a499ee77c6a2e36688fa6c96484611f33b1be4c57ab075a92dd \ - --hash=sha256:27f879ae604a7fcf371e59fba6f3ff4635a4c2a64768bd83ff0cac503142fef4 \ - --hash=sha256:2b230028a008ae1d0f430acb227d323ff8a619017415cf334c38b457f814119f \ - --hash=sha256:3059668df17627f0e0fa680e9ef8c995c946c792612e9518f5cc1503be14e90b \ - --hash=sha256:31176aa88f36020055ace9adff2405a33c8bdbfa72a9c4980e25d91b2f196873 \ - --hash=sha256:36f53c2b3449c015880e7d55a89c992c357f176327b0d2873cdaaf9628a37c69 \ - --hash=sha256:3b4368b33908f683a363f376dfb747d40af3463a6e5044afee07cf9436addf96 \ - --hash=sha256:3c61d641d4f409c5ae46bfdd89ea42ce5ea233dcf69e74ce9ba32b503c727e29 \ - --hash=sha256:4abb717e320e74959517dc8e84a9f48fbe90e9abe19c248541e9418b1ce60acd \ - --hash=sha256:4c93f4abbb54321ee6471e04a00139c80c754eda51064187963ddf98f5cf36a4 \ - --hash=sha256:535561990e075fa6bd4b16c4c3c1096b9581b7bb35d96fac4650f1181e428268 \ - --hash=sha256:53c9aa5ddd6857c0a1cd0287225a2a25873a8e09727c2e95c4aebb1be83a766a \ - --hash=sha256:5d573e70a6fe77555fb6143c12d3a7d3fa306632a3034b4e7c59ca09721546f8 \ - --hash=sha256:6009386a2df66159f64ac9f20425ae25229b29b9dd0e1d3dd60043f037e2ad7e \ - --hash=sha256:686e975a5d16602dc0982c7c703948d17184bd1397e16c8ee03511ecb8c4cdda \ - --hash=sha256:6959fb07e8351e20501ffb8cc4074c39a0b7ef123e1c850a7f8f3afdc3a3da01 \ - --hash=sha256:6b25ed37c27e652db01be341af93fbcea03d296c024d8a0e680017a268eb85dd \ - --hash=sha256:6da6dea3a1bacf99b3c2187e296db9a83029ed9c38fd4c52b7c9b7326d13c828 \ - --hash=sha256:72ca2399097c0b758198f2ff30f7178d680de8a5cfcf3d9b73a63cf87455532e \ - --hash=sha256:73abb8584b0cf74d37f5ef61c10722adc7275502ab71789a8fe3cb7ef04cf6e2 \ - --hash=sha256:74100fecaec8a535e380cf5f2fb556ff84957d481c13e54051c52e5baac70541 \ - --hash=sha256:75c6ecb70e809cf1504465174343113f51f24bc61e22a80ae1c859f3f7034c6d \ - --hash=sha256:7cf05053242f61ba94014dd3a986e11a083400a32664058f80bf4cf817c0b3a1 \ - --hash=sha256:9411e24328a2302e279e70cae6e479f1fddde79629fcb14e03e6d94b3956eabf \ - 
--hash=sha256:a213acfbf186b9f35803b52e4ca9addb153fc0b67f82a48f961be7000ecf6721 \ - --hash=sha256:bb7e0fe6ad73b7f06d7e2b689c19a71cf5cc48f0c2bf8608469e51ffe0bd2867 \ - --hash=sha256:c2504eed520958a5b77cc99458297cb7906308cb92327f35fb7fbbad4e9b2188 \ - --hash=sha256:c35aa9657f5d5116d23b934568e0956bd50c615127810fffe3ac356a914c176a \ - --hash=sha256:c5f09cffa619adfb44799fa4a81c2a1ad77c887187613fb0a8f201ab38d89ba1 \ - --hash=sha256:c978f864b35f2261e0819f5cd88b9830b04dc51bcf055aac3c601e525a10d2ba \ - --hash=sha256:cbe946b3e6e60a7b4618f091e62a029cb082b109a9d6b53962dd305087c6e4fd \ - --hash=sha256:cc3e4cd087f07758b16bef8f31d88dbb1b5da5671d2f03685ab52dece3d7a16e \ - --hash=sha256:cf0dead5a2c5a3347af2cfec7131d4f2a2e03c934af28989c9078f8241a491fa \ - --hash=sha256:d2794f0e68b3085d99b4f6ff9c089f6fdd02b32b9d3efdfbb55beac1bf22d516 \ - --hash=sha256:d2fa68a96a30dd240be80bbad838a0ac81a61770611ff7952b889485970c4c71 \ - --hash=sha256:d6f70406695e3220f09cd7a2f879333279d91aa4a8a1d34303b56d61a8180137 \ - --hash=sha256:d8f9cd4ad1be90b0cf350a2f04a38a36e44a026cac1e036ac593dc48efe91d52 \ - --hash=sha256:da2d94c15f88cd40d7e67f7919d4f60110d2b9d5b1e08cf354c2be773ab13479 \ - --hash=sha256:e1727c1c0e394096bb9af185c6923e8ea55a5095b8af44f06903bcc0e06800a2 \ - --hash=sha256:e420ced29b5904cdf9ee5545e23f9406189d8acb6750916c2db4793dada065c6 \ - --hash=sha256:e82c5cf1495244adf5252f925ac5932e5fd288b3e5ab6b70bec5593074b7236c \ - --hash=sha256:f1ef0d39bc1feb420caf549b3c657c871cad4ebbcf0580c4d03816b0590de0cf \ - --hash=sha256:f8753a6c88d1d0ba64302309eecf20f70d2770f65ca02d83c2452279085bfcd3 \ - --hash=sha256:f93dbf58f03146164048be5426ffde298b237a5e059144847e4940f5b80172c3 +grpcio==1.60.1 \ + --hash=sha256:0250a7a70b14000fa311de04b169cc7480be6c1a769b190769d347939d3232a8 \ + --hash=sha256:069fe2aeee02dfd2135d562d0663fe70fbb69d5eed6eb3389042a7e963b54de8 \ + --hash=sha256:082081e6a36b6eb5cf0fd9a897fe777dbb3802176ffd08e3ec6567edd85bc104 \ + 
--hash=sha256:0c5807e9152eff15f1d48f6b9ad3749196f79a4a050469d99eecb679be592acc \ + --hash=sha256:14e8f2c84c0832773fb3958240c69def72357bc11392571f87b2d7b91e0bb092 \ + --hash=sha256:2a6087f234cb570008a6041c8ffd1b7d657b397fdd6d26e83d72283dae3527b1 \ + --hash=sha256:2bb2a2911b028f01c8c64d126f6b632fcd8a9ac975aa1b3855766c94e4107180 \ + --hash=sha256:2f44c32aef186bbba254129cea1df08a20be414144ac3bdf0e84b24e3f3b2e05 \ + --hash=sha256:30e980cd6db1088c144b92fe376747328d5554bc7960ce583ec7b7d81cd47287 \ + --hash=sha256:33aed0a431f5befeffd9d346b0fa44b2c01aa4aeae5ea5b2c03d3e25e0071216 \ + --hash=sha256:33bdea30dcfd4f87b045d404388469eb48a48c33a6195a043d116ed1b9a0196c \ + --hash=sha256:39aa848794b887120b1d35b1b994e445cc028ff602ef267f87c38122c1add50d \ + --hash=sha256:4216e67ad9a4769117433814956031cb300f85edc855252a645a9a724b3b6594 \ + --hash=sha256:49c9b6a510e3ed8df5f6f4f3c34d7fbf2d2cae048ee90a45cd7415abab72912c \ + --hash=sha256:4eec8b8c1c2c9b7125508ff7c89d5701bf933c99d3910e446ed531cd16ad5d87 \ + --hash=sha256:50d56280b482875d1f9128ce596e59031a226a8b84bec88cb2bf76c289f5d0de \ + --hash=sha256:53b69e79d00f78c81eecfb38f4516080dc7f36a198b6b37b928f1c13b3c063e9 \ + --hash=sha256:55ccb7db5a665079d68b5c7c86359ebd5ebf31a19bc1a91c982fd622f1e31ff2 \ + --hash=sha256:5a1ebbae7e2214f51b1f23b57bf98eeed2cf1ba84e4d523c48c36d5b2f8829ff \ + --hash=sha256:61b7199cd2a55e62e45bfb629a35b71fc2c0cb88f686a047f25b1112d3810904 \ + --hash=sha256:660fc6b9c2a9ea3bb2a7e64ba878c98339abaf1811edca904ac85e9e662f1d73 \ + --hash=sha256:6d140bdeb26cad8b93c1455fa00573c05592793c32053d6e0016ce05ba267549 \ + --hash=sha256:6e490fa5f7f5326222cb9f0b78f207a2b218a14edf39602e083d5f617354306f \ + --hash=sha256:6ecf21d20d02d1733e9c820fb5c114c749d888704a7ec824b545c12e78734d1c \ + --hash=sha256:70c83bb530572917be20c21f3b6be92cd86b9aecb44b0c18b1d3b2cc3ae47df0 \ + --hash=sha256:72153a0d2e425f45b884540a61c6639436ddafa1829a42056aa5764b84108b8e \ + --hash=sha256:73e14acd3d4247169955fae8fb103a2b900cfad21d0c35f0dcd0fdd54cd60367 \ + 
--hash=sha256:76eaaba891083fcbe167aa0f03363311a9f12da975b025d30e94b93ac7a765fc \ + --hash=sha256:79ae0dc785504cb1e1788758c588c711f4e4a0195d70dff53db203c95a0bd303 \ + --hash=sha256:7d142bcd604166417929b071cd396aa13c565749a4c840d6c702727a59d835eb \ + --hash=sha256:8c9554ca8e26241dabe7951aa1fa03a1ba0856688ecd7e7bdbdd286ebc272e4c \ + --hash=sha256:8d488fbdbf04283f0d20742b64968d44825617aa6717b07c006168ed16488804 \ + --hash=sha256:91422ba785a8e7a18725b1dc40fbd88f08a5bb4c7f1b3e8739cab24b04fa8a03 \ + --hash=sha256:9a66f4d2a005bc78e61d805ed95dedfcb35efa84b7bba0403c6d60d13a3de2d6 \ + --hash=sha256:9b106bc52e7f28170e624ba61cc7dc6829566e535a6ec68528f8e1afbed1c41f \ + --hash=sha256:9b54577032d4f235452f77a83169b6527bf4b77d73aeada97d45b2aaf1bf5ce0 \ + --hash=sha256:a09506eb48fa5493c58f946c46754ef22f3ec0df64f2b5149373ff31fb67f3dd \ + --hash=sha256:a212e5dea1a4182e40cd3e4067ee46be9d10418092ce3627475e995cca95de21 \ + --hash=sha256:a731ac5cffc34dac62053e0da90f0c0b8560396a19f69d9703e88240c8f05858 \ + --hash=sha256:af5ef6cfaf0d023c00002ba25d0751e5995fa0e4c9eec6cd263c30352662cbce \ + --hash=sha256:b58b855d0071575ea9c7bc0d84a06d2edfbfccec52e9657864386381a7ce1ae9 \ + --hash=sha256:bc808924470643b82b14fe121923c30ec211d8c693e747eba8a7414bc4351a23 \ + --hash=sha256:c557e94e91a983e5b1e9c60076a8fd79fea1e7e06848eb2e48d0ccfb30f6e073 \ + --hash=sha256:c71be3f86d67d8d1311c6076a4ba3b75ba5703c0b856b4e691c9097f9b1e8bd2 \ + --hash=sha256:c8754c75f55781515a3005063d9a05878b2cfb3cb7e41d5401ad0cf19de14872 \ + --hash=sha256:cb0af13433dbbd1c806e671d81ec75bd324af6ef75171fd7815ca3074fe32bfe \ + --hash=sha256:cba6209c96828711cb7c8fcb45ecef8c8859238baf15119daa1bef0f6c84bfe7 \ + --hash=sha256:cf77f8cf2a651fbd869fbdcb4a1931464189cd210abc4cfad357f1cacc8642a6 \ + --hash=sha256:d7404cebcdb11bb5bd40bf94131faf7e9a7c10a6c60358580fe83913f360f929 \ + --hash=sha256:dd1d3a8d1d2e50ad9b59e10aa7f07c7d1be2b367f3f2d33c5fade96ed5460962 \ + --hash=sha256:e5d97c65ea7e097056f3d1ead77040ebc236feaf7f71489383d20f3b4c28412a \ + 
--hash=sha256:f1c3dc536b3ee124e8b24feb7533e5c70b9f2ef833e3b2e5513b2897fd46763a \ + --hash=sha256:f2212796593ad1d0235068c79836861f2201fc7137a99aa2fea7beeb3b101177 \ + --hash=sha256:fead980fbc68512dfd4e0c7b1f5754c2a8e5015a04dea454b9cada54a8423525 # via # -r requirements.in # tb-nightly @@ -258,117 +249,119 @@ h5py==3.10.0 \ # via # -r requirements.in # keras-nightly -idna==3.4 \ - --hash=sha256:814f528e8dead7d329833b91c5faa87d60bf71824cd12a7530b5526063d02cb4 \ - --hash=sha256:90b77e79eaa3eba6de819a0c442c0b4ceefc341a7a2ab77d7562bf49f425c5c2 +idna==3.6 \ + --hash=sha256:9ecdbbd083b06798ae1e86adcbfe8ab1479cf864e4ee30fe4e46a003d12491ca \ + --hash=sha256:c05567e9c24a6b9faaa835c4821bad0590fbb9d5779e7caa6e1cc4978e7eb24f # via requests -importlib-metadata==6.8.0 \ - --hash=sha256:3ebb78df84a805d7698245025b975d9d67053cd94c79245ba4b3eb694abe68bb \ - --hash=sha256:dbace7892d8c0c4ac1ad096662232f831d4e64f4c4545bd53016a3e9d4654743 +importlib-metadata==7.0.1 \ + --hash=sha256:4805911c3a4ec7c3966410053e9ec6a1fecd629117df5adee56dfc9432a1081e \ + --hash=sha256:f238736bb06590ae52ac1fab06a3a9ef1d8dce2b7a35b5ab329371d6c8f5d2cc # via markdown jax==0.4.7 \ --hash=sha256:5e7002d74db25f97c99b979d4ba1233b1ef26e1597e5fc468ad11d1c8a9dc4f8 # via -r requirements.in -keras-nightly==3.0.0.dev2023103103 \ - --hash=sha256:25a6a9030d7e067d535ec41ae6700636c90cabada80fbddb3bb8775006807860 \ - --hash=sha256:c39fccebb3d4cc9d838371981c4d3eef08fc06eadf6cffb39b356cb625bab50f +keras-nightly==3.0.4.dev2024021403 \ + --hash=sha256:24ce69d29d582771685bf4235f59663723405b5a5b16f3eaff2657e52e74663a \ + --hash=sha256:9f416e66b820ef833779d219d255b346b8b90a72fdbd0b2f1e90a43ad142a03d # via -r requirements.in -lit==17.0.4 \ - --hash=sha256:ee2e180128e770abc6aed3a02de2daf09d81b7d30225e315205d3599c311d304 +lit==17.0.6 \ + --hash=sha256:dfa9af9b55fc4509a56be7bf2346f079d7f4a242d583b9f2e0b078fd0abae31b # via -r requirements.in -markdown==3.5 \ - --hash=sha256:4afb124395ce5fc34e6d9886dab977fd9ae987fc6e85689f08278cf0c69d4bf3 \ 
- --hash=sha256:a807eb2e4778d9156c8f07876c6e4d50b5494c5665c4834f67b06459dfd877b3 +markdown==3.5.2 \ + --hash=sha256:d43323865d89fc0cb9b20c75fc8ad313af307cc087e84b657d9eec768eddeadd \ + --hash=sha256:e1ac7b3dc550ee80e602e71c1d168002f062e49f1b11e26a36264dafd4df2ef8 # via tb-nightly markdown-it-py==3.0.0 \ --hash=sha256:355216845c60bd96232cd8d8c40e8f9765cc86f46880e43a8fd22dc1a1a8cab1 \ --hash=sha256:e3f60a94fa066dc52ec76661e37c851cb232d92f9886b15cb560aaada2df8feb # via rich -markupsafe==2.1.3 \ - --hash=sha256:05fb21170423db021895e1ea1e1f3ab3adb85d1c2333cbc2310f2a26bc77272e \ - --hash=sha256:0a4e4a1aff6c7ac4cd55792abf96c915634c2b97e3cc1c7129578aa68ebd754e \ - --hash=sha256:10bbfe99883db80bdbaff2dcf681dfc6533a614f700da1287707e8a5d78a8431 \ - --hash=sha256:134da1eca9ec0ae528110ccc9e48041e0828d79f24121a1a146161103c76e686 \ - --hash=sha256:14ff806850827afd6b07a5f32bd917fb7f45b046ba40c57abdb636674a8b559c \ - --hash=sha256:1577735524cdad32f9f694208aa75e422adba74f1baee7551620e43a3141f559 \ - --hash=sha256:1b40069d487e7edb2676d3fbdb2b0829ffa2cd63a2ec26c4938b2d34391b4ecc \ - --hash=sha256:1b8dd8c3fd14349433c79fa8abeb573a55fc0fdd769133baac1f5e07abf54aeb \ - --hash=sha256:1f67c7038d560d92149c060157d623c542173016c4babc0c1913cca0564b9939 \ - --hash=sha256:282c2cb35b5b673bbcadb33a585408104df04f14b2d9b01d4c345a3b92861c2c \ - --hash=sha256:2c1b19b3aaacc6e57b7e25710ff571c24d6c3613a45e905b1fde04d691b98ee0 \ - --hash=sha256:2ef12179d3a291be237280175b542c07a36e7f60718296278d8593d21ca937d4 \ - --hash=sha256:338ae27d6b8745585f87218a3f23f1512dbf52c26c28e322dbe54bcede54ccb9 \ - --hash=sha256:3c0fae6c3be832a0a0473ac912810b2877c8cb9d76ca48de1ed31e1c68386575 \ - --hash=sha256:3fd4abcb888d15a94f32b75d8fd18ee162ca0c064f35b11134be77050296d6ba \ - --hash=sha256:42de32b22b6b804f42c5d98be4f7e5e977ecdd9ee9b660fda1a3edf03b11792d \ - --hash=sha256:47d4f1c5f80fc62fdd7777d0d40a2e9dda0a05883ab11374334f6c4de38adffd \ - --hash=sha256:504b320cd4b7eff6f968eddf81127112db685e81f7e36e75f9f84f0df46041c3 \ - 
--hash=sha256:525808b8019e36eb524b8c68acdd63a37e75714eac50e988180b169d64480a00 \ - --hash=sha256:56d9f2ecac662ca1611d183feb03a3fa4406469dafe241673d521dd5ae92a155 \ - --hash=sha256:5bbe06f8eeafd38e5d0a4894ffec89378b6c6a625ff57e3028921f8ff59318ac \ - --hash=sha256:65c1a9bcdadc6c28eecee2c119465aebff8f7a584dd719facdd9e825ec61ab52 \ - --hash=sha256:68e78619a61ecf91e76aa3e6e8e33fc4894a2bebe93410754bd28fce0a8a4f9f \ - --hash=sha256:69c0f17e9f5a7afdf2cc9fb2d1ce6aabdb3bafb7f38017c0b77862bcec2bbad8 \ - --hash=sha256:6b2b56950d93e41f33b4223ead100ea0fe11f8e6ee5f641eb753ce4b77a7042b \ - --hash=sha256:715d3562f79d540f251b99ebd6d8baa547118974341db04f5ad06d5ea3eb8007 \ - --hash=sha256:787003c0ddb00500e49a10f2844fac87aa6ce977b90b0feaaf9de23c22508b24 \ - --hash=sha256:7ef3cb2ebbf91e330e3bb937efada0edd9003683db6b57bb108c4001f37a02ea \ - --hash=sha256:8023faf4e01efadfa183e863fefde0046de576c6f14659e8782065bcece22198 \ - --hash=sha256:8758846a7e80910096950b67071243da3e5a20ed2546e6392603c096778d48e0 \ - --hash=sha256:8afafd99945ead6e075b973fefa56379c5b5c53fd8937dad92c662da5d8fd5ee \ - --hash=sha256:8c41976a29d078bb235fea9b2ecd3da465df42a562910f9022f1a03107bd02be \ - --hash=sha256:8e254ae696c88d98da6555f5ace2279cf7cd5b3f52be2b5cf97feafe883b58d2 \ - --hash=sha256:8f9293864fe09b8149f0cc42ce56e3f0e54de883a9de90cd427f191c346eb2e1 \ - --hash=sha256:9402b03f1a1b4dc4c19845e5c749e3ab82d5078d16a2a4c2cd2df62d57bb0707 \ - --hash=sha256:962f82a3086483f5e5f64dbad880d31038b698494799b097bc59c2edf392fce6 \ - --hash=sha256:9aad3c1755095ce347e26488214ef77e0485a3c34a50c5a5e2471dff60b9dd9c \ - --hash=sha256:9dcdfd0eaf283af041973bff14a2e143b8bd64e069f4c383416ecd79a81aab58 \ - --hash=sha256:aa57bd9cf8ae831a362185ee444e15a93ecb2e344c8e52e4d721ea3ab6ef1823 \ - --hash=sha256:aa7bd130efab1c280bed0f45501b7c8795f9fdbeb02e965371bbef3523627779 \ - --hash=sha256:ab4a0df41e7c16a1392727727e7998a467472d0ad65f3ad5e6e765015df08636 \ - --hash=sha256:ad9e82fb8f09ade1c3e1b996a6337afac2b8b9e365f926f5a61aacc71adc5b3c \ - 
--hash=sha256:af598ed32d6ae86f1b747b82783958b1a4ab8f617b06fe68795c7f026abbdcad \ - --hash=sha256:b076b6226fb84157e3f7c971a47ff3a679d837cf338547532ab866c57930dbee \ - --hash=sha256:b7ff0f54cb4ff66dd38bebd335a38e2c22c41a8ee45aa608efc890ac3e3931bc \ - --hash=sha256:bfce63a9e7834b12b87c64d6b155fdd9b3b96191b6bd334bf37db7ff1fe457f2 \ - --hash=sha256:c011a4149cfbcf9f03994ec2edffcb8b1dc2d2aede7ca243746df97a5d41ce48 \ - --hash=sha256:c9c804664ebe8f83a211cace637506669e7890fec1b4195b505c214e50dd4eb7 \ - --hash=sha256:ca379055a47383d02a5400cb0d110cef0a776fc644cda797db0c5696cfd7e18e \ - --hash=sha256:cb0932dc158471523c9637e807d9bfb93e06a95cbf010f1a38b98623b929ef2b \ - --hash=sha256:cd0f502fe016460680cd20aaa5a76d241d6f35a1c3350c474bac1273803893fa \ - --hash=sha256:ceb01949af7121f9fc39f7d27f91be8546f3fb112c608bc4029aef0bab86a2a5 \ - --hash=sha256:d080e0a5eb2529460b30190fcfcc4199bd7f827663f858a226a81bc27beaa97e \ - --hash=sha256:dd15ff04ffd7e05ffcb7fe79f1b98041b8ea30ae9234aed2a9168b5797c3effb \ - --hash=sha256:df0be2b576a7abbf737b1575f048c23fb1d769f267ec4358296f31c2479db8f9 \ - --hash=sha256:e09031c87a1e51556fdcb46e5bd4f59dfb743061cf93c4d6831bf894f125eb57 \ - --hash=sha256:e4dd52d80b8c83fdce44e12478ad2e85c64ea965e75d66dbeafb0a3e77308fcc \ - --hash=sha256:f698de3fd0c4e6972b92290a45bd9b1536bffe8c6759c62471efaa8acb4c37bc \ - --hash=sha256:fec21693218efe39aa7f8599346e90c705afa52c5b31ae019b2e57e8f6542bb2 \ - --hash=sha256:ffcc3f7c66b5f5b7931a5aa68fc9cecc51e685ef90282f4a82f0f5e9b704ad11 +markupsafe==2.1.5 \ + --hash=sha256:00e046b6dd71aa03a41079792f8473dc494d564611a8f89bbbd7cb93295ebdcf \ + --hash=sha256:075202fa5b72c86ad32dc7d0b56024ebdbcf2048c0ba09f1cde31bfdd57bcfff \ + --hash=sha256:0e397ac966fdf721b2c528cf028494e86172b4feba51d65f81ffd65c63798f3f \ + --hash=sha256:17b950fccb810b3293638215058e432159d2b71005c74371d784862b7e4683f3 \ + --hash=sha256:1f3fbcb7ef1f16e48246f704ab79d79da8a46891e2da03f8783a5b6fa41a9532 \ + 
--hash=sha256:2174c595a0d73a3080ca3257b40096db99799265e1c27cc5a610743acd86d62f \ + --hash=sha256:2b7c57a4dfc4f16f7142221afe5ba4e093e09e728ca65c51f5620c9aaeb9a617 \ + --hash=sha256:2d2d793e36e230fd32babe143b04cec8a8b3eb8a3122d2aceb4a371e6b09b8df \ + --hash=sha256:30b600cf0a7ac9234b2638fbc0fb6158ba5bdcdf46aeb631ead21248b9affbc4 \ + --hash=sha256:397081c1a0bfb5124355710fe79478cdbeb39626492b15d399526ae53422b906 \ + --hash=sha256:3a57fdd7ce31c7ff06cdfbf31dafa96cc533c21e443d57f5b1ecc6cdc668ec7f \ + --hash=sha256:3c6b973f22eb18a789b1460b4b91bf04ae3f0c4234a0a6aa6b0a92f6f7b951d4 \ + --hash=sha256:3e53af139f8579a6d5f7b76549125f0d94d7e630761a2111bc431fd820e163b8 \ + --hash=sha256:4096e9de5c6fdf43fb4f04c26fb114f61ef0bf2e5604b6ee3019d51b69e8c371 \ + --hash=sha256:4275d846e41ecefa46e2015117a9f491e57a71ddd59bbead77e904dc02b1bed2 \ + --hash=sha256:4c31f53cdae6ecfa91a77820e8b151dba54ab528ba65dfd235c80b086d68a465 \ + --hash=sha256:4f11aa001c540f62c6166c7726f71f7573b52c68c31f014c25cc7901deea0b52 \ + --hash=sha256:5049256f536511ee3f7e1b3f87d1d1209d327e818e6ae1365e8653d7e3abb6a6 \ + --hash=sha256:58c98fee265677f63a4385256a6d7683ab1832f3ddd1e66fe948d5880c21a169 \ + --hash=sha256:598e3276b64aff0e7b3451b72e94fa3c238d452e7ddcd893c3ab324717456bad \ + --hash=sha256:5b7b716f97b52c5a14bffdf688f971b2d5ef4029127f1ad7a513973cfd818df2 \ + --hash=sha256:5dedb4db619ba5a2787a94d877bc8ffc0566f92a01c0ef214865e54ecc9ee5e0 \ + --hash=sha256:619bc166c4f2de5caa5a633b8b7326fbe98e0ccbfacabd87268a2b15ff73a029 \ + --hash=sha256:629ddd2ca402ae6dbedfceeba9c46d5f7b2a61d9749597d4307f943ef198fc1f \ + --hash=sha256:656f7526c69fac7f600bd1f400991cc282b417d17539a1b228617081106feb4a \ + --hash=sha256:6ec585f69cec0aa07d945b20805be741395e28ac1627333b1c5b0105962ffced \ + --hash=sha256:72b6be590cc35924b02c78ef34b467da4ba07e4e0f0454a2c5907f473fc50ce5 \ + --hash=sha256:7502934a33b54030eaf1194c21c692a534196063db72176b0c4028e140f8f32c \ + --hash=sha256:7a68b554d356a91cce1236aa7682dc01df0edba8d043fd1ce607c49dd3c1edcf \ + 
--hash=sha256:7b2e5a267c855eea6b4283940daa6e88a285f5f2a67f2220203786dfa59b37e9 \ + --hash=sha256:823b65d8706e32ad2df51ed89496147a42a2a6e01c13cfb6ffb8b1e92bc910bb \ + --hash=sha256:8590b4ae07a35970728874632fed7bd57b26b0102df2d2b233b6d9d82f6c62ad \ + --hash=sha256:8dd717634f5a044f860435c1d8c16a270ddf0ef8588d4887037c5028b859b0c3 \ + --hash=sha256:8dec4936e9c3100156f8a2dc89c4b88d5c435175ff03413b443469c7c8c5f4d1 \ + --hash=sha256:97cafb1f3cbcd3fd2b6fbfb99ae11cdb14deea0736fc2b0952ee177f2b813a46 \ + --hash=sha256:a17a92de5231666cfbe003f0e4b9b3a7ae3afb1ec2845aadc2bacc93ff85febc \ + --hash=sha256:a549b9c31bec33820e885335b451286e2969a2d9e24879f83fe904a5ce59d70a \ + --hash=sha256:ac07bad82163452a6884fe8fa0963fb98c2346ba78d779ec06bd7a6262132aee \ + --hash=sha256:ae2ad8ae6ebee9d2d94b17fb62763125f3f374c25618198f40cbb8b525411900 \ + --hash=sha256:b91c037585eba9095565a3556f611e3cbfaa42ca1e865f7b8015fe5c7336d5a5 \ + --hash=sha256:bc1667f8b83f48511b94671e0e441401371dfd0f0a795c7daa4a3cd1dde55bea \ + --hash=sha256:bec0a414d016ac1a18862a519e54b2fd0fc8bbfd6890376898a6c0891dd82e9f \ + --hash=sha256:bf50cd79a75d181c9181df03572cdce0fbb75cc353bc350712073108cba98de5 \ + --hash=sha256:bff1b4290a66b490a2f4719358c0cdcd9bafb6b8f061e45c7a2460866bf50c2e \ + --hash=sha256:c061bb86a71b42465156a3ee7bd58c8c2ceacdbeb95d05a99893e08b8467359a \ + --hash=sha256:c8b29db45f8fe46ad280a7294f5c3ec36dbac9491f2d1c17345be8e69cc5928f \ + --hash=sha256:ce409136744f6521e39fd8e2a24c53fa18ad67aa5bc7c2cf83645cce5b5c4e50 \ + --hash=sha256:d050b3361367a06d752db6ead6e7edeb0009be66bc3bae0ee9d97fb326badc2a \ + --hash=sha256:d283d37a890ba4c1ae73ffadf8046435c76e7bc2247bbb63c00bd1a709c6544b \ + --hash=sha256:d9fad5155d72433c921b782e58892377c44bd6252b5af2f67f16b194987338a4 \ + --hash=sha256:daa4ee5a243f0f20d528d939d06670a298dd39b1ad5f8a72a4275124a7819eff \ + --hash=sha256:db0b55e0f3cc0be60c1f19efdde9a637c32740486004f20d1cff53c3c0ece4d2 \ + --hash=sha256:e61659ba32cf2cf1481e575d0462554625196a1f2fc06a1c777d3f48e8865d46 \ + 
--hash=sha256:ea3d8a3d18833cf4304cd2fc9cbb1efe188ca9b5efef2bdac7adc20594a0e46b \ + --hash=sha256:ec6a563cff360b50eed26f13adc43e61bc0c04d94b8be985e6fb24b81f6dcfdf \ + --hash=sha256:f5dfb42c4604dddc8e4305050aa6deb084540643ed5804d7455b5df8fe16f5e5 \ + --hash=sha256:fa173ec60341d6bb97a89f5ea19c85c5643c1e7dedebc22f5181eb73573142c5 \ + --hash=sha256:fa9db3f79de01457b03d4f01b34cf91bc0048eb2c3846ff26f66687c2f6d16ab \ + --hash=sha256:fce659a462a1be54d2ffcacea5e3ba2d74daa74f30f5f143fe0c58636e355fdd \ + --hash=sha256:ffee1f21e5ef0d712f9033568f8344d5da8cc2869dbd08d87c84656e6a2d2f68 # via werkzeug mdurl==0.1.2 \ --hash=sha256:84008a41e51615a49fc9966191ff91509e3c40b939176e643fd50a5c2196b8f8 \ --hash=sha256:bb413d29f5eea38f31dd4754dd7377d4465116fb207585f97bf925588687c1ba # via markdown-it-py -ml-dtypes==0.3.1 \ - --hash=sha256:3d8ca0acbd377082792d8b97081ba580abdad67c6afb7f827012c675b052f058 \ - --hash=sha256:42a8980afd8b7c8e270e8b5c260237286b5b26acd276fcb758d13cd7cb567e99 \ - --hash=sha256:438437e2e614a3c91d75581653b6c40ec890e8b5994d7190a90c931740151c95 \ - --hash=sha256:4828b62fa3bf1ae35faa40f3db9a38ec72fbce02f328a1d14c3a9da4606af364 \ - --hash=sha256:4d94b2d1bed77284694f7fd0479640fa7aa5d96433dca3cbcec407a5ef752e77 \ - --hash=sha256:510d249a91face47211762eb294d6fe64f325356b965fb6388c1bf51bd339267 \ - --hash=sha256:5727effa7650f7ab10906542d137cfb3244fdc3b2b519beff42f82def8ba59be \ - --hash=sha256:5e0b0b6bb07fa5ad11bb61d174667176bee5e05857225067aabfc5adc1b51d23 \ - --hash=sha256:60778f99194b4c4f36ba42da200b35ef851ce4d4af698aaf70f5b91fe70fc611 \ - --hash=sha256:70984b473db6489ec1d8c79b082a1322105155193049d08a3b0c515094e9777b \ - --hash=sha256:979d7d196d9a17e0135ae22878f74241fbd3522cef58d7b292f1fd5b32282201 \ - --hash=sha256:a777928dcba8865ab4a8157eeb25d23aed7bc82e5fd74e1d5eca821d3f148b39 \ - --hash=sha256:cb0c404e0dd3e25b56362c1c1e5de0ef717f727dde59fa721af4ce6ab2acca44 \ - --hash=sha256:d1a8dc3bac1da2a17d0e2e4cba36ee89721d0bd33ea4765af2eefb5f41409e0f \ - 
--hash=sha256:da274599e4950a9b488d21571061f49a185537cc77f2d3f8121151d58a9e9f16 \ - --hash=sha256:f83ff080df8910c0f987f615b03e4f8198638e0c00c6e679ea8892dda909763b \ - --hash=sha256:fcae2c69715410d96906e1dfe8f017d9f78a0d10e0df91aae52e91f51fdfe45e - # via jax +ml-dtypes==0.3.2 \ + --hash=sha256:2c34f2ba9660b21fe1034b608308a01be82bbef2a92fb8199f24dc6bad0d5226 \ + --hash=sha256:3a17ef2322e60858d93584e9c52a5be7dd6236b056b7fa1ec57f1bb6ba043e33 \ + --hash=sha256:533059bc5f1764fac071ef54598db358c167c51a718f68f5bb55e3dee79d2967 \ + --hash=sha256:6604877d567a29bfe7cc02969ae0f2425260e5335505cf5e7fefc3e5465f5655 \ + --hash=sha256:6b35c4e8ca957c877ac35c79ffa77724ecc3702a1e4b18b08306c03feae597bb \ + --hash=sha256:763697ab8a88d47443997a7cdf3aac7340049aed45f7521f6b0ec8a0594821fe \ + --hash=sha256:7a4c3fcbf86fa52d0204f07cfd23947ef05b4ad743a1a988e163caa34a201e5e \ + --hash=sha256:7afde548890a92b41c0fed3a6c525f1200a5727205f73dc21181a2726571bb53 \ + --hash=sha256:7ba8e1fafc7fff3e643f453bffa7d082df1678a73286ce8187d3e825e776eb94 \ + --hash=sha256:91f8783fd1f2c23fd3b9ee5ad66b785dafa58ba3cdb050c4458021fa4d1eb226 \ + --hash=sha256:93b78f53431c93953f7850bb1b925a17f0ab5d97527e38a7e865b5b4bc5cfc18 \ + --hash=sha256:961134ea44c7b8ca63eda902a44b58cd8bd670e21d62e255c81fba0a8e70d9b7 \ + --hash=sha256:b89b194e9501a92d289c1ffd411380baf5daafb9818109a4f49b0a1b6dce4462 \ + --hash=sha256:c7b3fb3d4f6b39bcd4f6c4b98f406291f0d681a895490ee29a0f95bab850d53c \ + --hash=sha256:d1a746fe5fb9cd974a91070174258f0be129c592b93f9ce7df6cc336416c3fbd \ + --hash=sha256:e8505946df1665db01332d885c2020b4cb9e84a8b1241eb4ba69d59591f65855 \ + --hash=sha256:f47619d978ab1ae7dfdc4052ea97c636c6263e1f19bd1be0e42c346b98d15ff4 + # via + # jax + # keras-nightly namex==0.0.7 \ --hash=sha256:84ba65bc4d22bd909e3d26bf2ffb4b9529b608cb3f9a4336f776b04204ced69b \ --hash=sha256:8a4f062945f405d77cb66b907f16aa2fd83681945e998be840eb6c4154d40108 @@ -411,10 +404,6 @@ numpy==1.23.5 ; python_version <= "3.11" \ # opt-einsum # scipy # tb-nightly 
-oauthlib==3.2.2 \ - --hash=sha256:8139f29aac13e25d502680e9e19963e83f16838d48a0d71c287fe40e7067fbca \ - --hash=sha256:9859c40929662bec5d64f34d01c99e093149682a3f38915dc0655d5a633dd918 - # via requests-oauthlib opt-einsum==3.3.0 \ --hash=sha256:2455e59e3947d3c275477df7f5205b30635e266fe6dc300e3d9f9646bfcea147 \ --hash=sha256:59f6475f77bbc37dcf7cd748519c0ec60722e91e63ca114e68821c0c54a46549 @@ -444,57 +433,36 @@ protobuf==4.23.4 \ --hash=sha256:effeac51ab79332d44fba74660d40ae79985901ac21bca408f8dc335a81aa597 \ --hash=sha256:fee88269a090ada09ca63551bf2f573eb2424035bcf2cb1b121895b01a46594a # via tb-nightly -psutil==5.9.6 \ - --hash=sha256:10e8c17b4f898d64b121149afb136c53ea8b68c7531155147867b7b1ac9e7e28 \ - --hash=sha256:18cd22c5db486f33998f37e2bb054cc62fd06646995285e02a51b1e08da97017 \ - --hash=sha256:3ebf2158c16cc69db777e3c7decb3c0f43a7af94a60d72e87b2823aebac3d602 \ - --hash=sha256:51dc3d54607c73148f63732c727856f5febec1c7c336f8f41fcbd6315cce76ac \ - --hash=sha256:6e5fb8dc711a514da83098bc5234264e551ad980cec5f85dabf4d38ed6f15e9a \ - --hash=sha256:70cb3beb98bc3fd5ac9ac617a327af7e7f826373ee64c80efd4eb2856e5051e9 \ - --hash=sha256:748c9dd2583ed86347ed65d0035f45fa8c851e8d90354c122ab72319b5f366f4 \ - --hash=sha256:91ecd2d9c00db9817a4b4192107cf6954addb5d9d67a969a4f436dbc9200f88c \ - --hash=sha256:92e0cc43c524834af53e9d3369245e6cc3b130e78e26100d1f63cdb0abeb3d3c \ - --hash=sha256:a6f01f03bf1843280f4ad16f4bde26b817847b4c1a0db59bf6419807bc5ce05c \ - --hash=sha256:c69596f9fc2f8acd574a12d5f8b7b1ba3765a641ea5d60fb4736bf3c08a8214a \ - --hash=sha256:ca2780f5e038379e520281e4c032dddd086906ddff9ef0d1b9dcf00710e5071c \ - --hash=sha256:daecbcbd29b289aac14ece28eca6a3e60aa361754cf6da3dfb20d4d32b6c7f57 \ - --hash=sha256:e4b92ddcd7dd4cdd3f900180ea1e104932c7bce234fb88976e2a3b296441225a \ - --hash=sha256:fb8a697f11b0f5994550555fcfe3e69799e5b060c8ecf9e2f75c69302cc35c0d \ - --hash=sha256:ff18b8d1a784b810df0b0fff3bcb50ab941c3b8e2c8de5726f9c71c601c611aa +psutil==5.9.8 \ + 
--hash=sha256:02615ed8c5ea222323408ceba16c60e99c3f91639b07da6373fb7e6539abc56d \ + --hash=sha256:05806de88103b25903dff19bb6692bd2e714ccf9e668d050d144012055cbca73 \ + --hash=sha256:26bd09967ae00920df88e0352a91cff1a78f8d69b3ecabbfe733610c0af486c8 \ + --hash=sha256:27cc40c3493bb10de1be4b3f07cae4c010ce715290a5be22b98493509c6299e2 \ + --hash=sha256:36f435891adb138ed3c9e58c6af3e2e6ca9ac2f365efe1f9cfef2794e6c93b4e \ + --hash=sha256:50187900d73c1381ba1454cf40308c2bf6f34268518b3f36a9b663ca87e65e36 \ + --hash=sha256:611052c4bc70432ec770d5d54f64206aa7203a101ec273a0cd82418c86503bb7 \ + --hash=sha256:6be126e3225486dff286a8fb9a06246a5253f4c7c53b475ea5f5ac934e64194c \ + --hash=sha256:7d79560ad97af658a0f6adfef8b834b53f64746d45b403f225b85c5c2c140eee \ + --hash=sha256:8cb6403ce6d8e047495a701dc7c5bd788add903f8986d523e3e20b98b733e421 \ + --hash=sha256:8db4c1b57507eef143a15a6884ca10f7c73876cdf5d51e713151c1236a0e68cf \ + --hash=sha256:aee678c8720623dc456fa20659af736241f575d79429a0e5e9cf88ae0605cc81 \ + --hash=sha256:bc56c2a1b0d15aa3eaa5a60c9f3f8e3e565303b465dbf57a1b730e7a2b9844e0 \ + --hash=sha256:bd1184ceb3f87651a67b2708d4c3338e9b10c5df903f2e3776b62303b26cb631 \ + --hash=sha256:d06016f7f8625a1825ba3732081d77c94589dca78b7a3fc072194851e88461a4 \ + --hash=sha256:d16bbddf0693323b8c6123dd804100241da461e41d6e332fb0ba6058f630f8c8 # via portpicker -pyasn1==0.5.0 \ - --hash=sha256:87a2121042a1ac9358cabcaf1d07680ff97ee6404333bacca15f76aa8ad01a57 \ - --hash=sha256:97b7290ca68e62a832558ec3976f15cbf911bf5d7c7039d8b861c2a0ece69fde - # via - # pyasn1-modules - # rsa -pyasn1-modules==0.3.0 \ - --hash=sha256:5bd01446b736eb9d31512a30d46c1ac3395d676c6f3cafa4c03eb54b9925631c \ - --hash=sha256:d3ccd6ed470d9ffbc716be08bd90efbd44d0734bc9303818f7336070984a162d - # via google-auth -pygments==2.16.1 \ - --hash=sha256:13fc09fa63bc8d8671a6d247e1eb303c4b343eaee81d861f3404db2935653692 \ - --hash=sha256:1daff0494820c69bc8941e407aa20f577374ee88364ee10a98fdbe0aece96e29 +pygments==2.17.2 \ + 
--hash=sha256:b27c2826c47d0f3219f29554824c30c5e8945175d888647acd804ddd04af846c \ + --hash=sha256:da46cec9fd2de5be3a8a784f434e4c4ab670b4ff54d605c4c2717e9d49c4c367 # via rich requests==2.31.0 \ --hash=sha256:58cd2187c01e70e6e26505bca751777aa9f2ee0b7f4300988b709f44e013003f \ --hash=sha256:942c5a758f98d790eaed1a29cb6eefc7ffb0d1cf7af05c3d2791656dbd6ad1e1 - # via - # -r requirements.in - # requests-oauthlib - # tb-nightly -requests-oauthlib==1.3.1 \ - --hash=sha256:2577c501a2fb8d05a304c09d090d6e47c306fef15809d102b327cf8364bddab5 \ - --hash=sha256:75beac4a47881eeb94d5ea5d6ad31ef88856affe2332b9aafb52c6452ccf0d7a - # via google-auth-oauthlib -rich==13.6.0 \ - --hash=sha256:2b38e2fe9ca72c9a00170a1a2d20c63c790d0e10ef1fe35eba76e1e7b1d7d245 \ - --hash=sha256:5c14d22737e6d5084ef4771b62d5d4363165b403455a30a1c8ca39dc7b644bef + # via -r requirements.in +rich==13.7.0 \ + --hash=sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa \ + --hash=sha256:6da14c108c4866ee9520bbffa71f6fe3962e193b7da68720583850cd4548e235 # via keras-nightly -rsa==4.9 \ - --hash=sha256:90260d9058e514786967344d0ef75fa8727eed8a7d2e43ce9f4bcf1b536174f7 \ - --hash=sha256:e38464a49c6c85d7f1351b0126661487a7e0a14a50f1675ec50eb34d4f20ef21 - # via google-auth scipy==1.11.3 \ --hash=sha256:00f325434b6424952fbb636506f0567898dca7b0f7654d48f1c382ea338ce9a3 \ --hash=sha256:033c3fd95d55012dd1148b201b72ae854d5086d25e7c316ec9850de4fe776929 \ @@ -530,8 +498,8 @@ six==1.16.0 \ # via # astunparse # tb-nightly -tb-nightly==2.15.0a20231023 \ - --hash=sha256:8990a52985e3296aa18a6825efc017bcded9e2fb2cbcdd2d01f5c62d4fdc9825 +tb-nightly==2.17.0a20240214 \ + --hash=sha256:dbd59bcfd9b028e6199050e36cbb4ee1db4a94e69a7f8c00865f1498112a2b83 # via -r requirements.in tblib==2.0.0 \ --hash=sha256:9100bfa016b047d5b980d66e7efed952fbd20bd85b56110aaf473cb97d18709a \ @@ -546,13 +514,17 @@ termcolor==2.3.0 \ --hash=sha256:3afb05607b89aed0ffe25202399ee0867ad4d3cb4180d98aaf8eefa6a5f7d475 \ 
--hash=sha256:b5b08f68937f138fe92f6c089b99f1e2da0ae56c52b78bf7075fd95420fd9a5a # via -r requirements.in +tf-keras-nightly==2.16.0.dev2024021410 \ + --hash=sha256:05a4c19c6a795ec9ed6f4dac69d443601b9206b910524d8b19bad6e362d49f47 \ + --hash=sha256:83d5a1dd3979b42164a2fac4bdd2ed5981ef0c2a0514ca64e4bc67e369caef5c + # via tb-nightly typing-extensions==4.8.0 \ --hash=sha256:8f92fc8806f9a6b641eaa5318da32b44d401efaac0f6678c9bc448ba3605faa0 \ --hash=sha256:df8e4339e9cb77357558cbdbceca33c303714cf861d1eef15e1070055ae8b7ef # via -r requirements.in -urllib3==2.0.7 \ - --hash=sha256:c97dfde1f7bd43a71c8d2a58e369e9b2bf692d1334ea9f9cae55add7d0dd0f84 \ - --hash=sha256:fdb6d215c776278489906c2f8916e6e7d4f5a9b602ccbcfdf7f016fc8da0596e +urllib3==2.2.0 \ + --hash=sha256:051d961ad0c62a94e50ecf1af379c3aba230c66c710493493560c0c223c49f20 \ + --hash=sha256:ce3711610ddce217e6d113a2732fafad960a03fd0318c91faa79481e35c11224 # via requests werkzeug==3.0.1 \ --hash=sha256:507e811ecea72b18a404947aded4b3390e1db8f826b494d76550ef45bb3b1dcc \ @@ -564,81 +536,77 @@ wheel==0.41.3 \ # via # -r requirements.in # astunparse -wrapt==1.14.1 \ - --hash=sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3 \ - --hash=sha256:01c205616a89d09827986bc4e859bcabd64f5a0662a7fe95e0d359424e0e071b \ - --hash=sha256:02b41b633c6261feff8ddd8d11c711df6842aba629fdd3da10249a53211a72c4 \ - --hash=sha256:07f7a7d0f388028b2df1d916e94bbb40624c59b48ecc6cbc232546706fac74c2 \ - --hash=sha256:11871514607b15cfeb87c547a49bca19fde402f32e2b1c24a632506c0a756656 \ - --hash=sha256:1b376b3f4896e7930f1f772ac4b064ac12598d1c38d04907e696cc4d794b43d3 \ - --hash=sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9 \ - --hash=sha256:21ac0156c4b089b330b7666db40feee30a5d52634cc4560e1905d6529a3897ff \ - --hash=sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9 \ - --hash=sha256:257fd78c513e0fb5cdbe058c27a0624c9884e735bbd131935fd49e9fe719d310 \ - 
--hash=sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224 \ - --hash=sha256:2b39d38039a1fdad98c87279b48bc5dce2c0ca0d73483b12cb72aa9609278e8a \ - --hash=sha256:2cf71233a0ed05ccdabe209c606fe0bac7379fdcf687f39b944420d2a09fdb57 \ - --hash=sha256:2fe803deacd09a233e4762a1adcea5db5d31e6be577a43352936179d14d90069 \ - --hash=sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335 \ - --hash=sha256:3232822c7d98d23895ccc443bbdf57c7412c5a65996c30442ebe6ed3df335383 \ - --hash=sha256:34aa51c45f28ba7f12accd624225e2b1e5a3a45206aa191f6f9aac931d9d56fe \ - --hash=sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204 \ - --hash=sha256:36f582d0c6bc99d5f39cd3ac2a9062e57f3cf606ade29a0a0d6b323462f4dd87 \ - --hash=sha256:380a85cf89e0e69b7cfbe2ea9f765f004ff419f34194018a6827ac0e3edfed4d \ - --hash=sha256:40e7bc81c9e2b2734ea4bc1aceb8a8f0ceaac7c5299bc5d69e37c44d9081d43b \ - --hash=sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907 \ - --hash=sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be \ - --hash=sha256:4fcc4649dc762cddacd193e6b55bc02edca674067f5f98166d7713b193932b7f \ - --hash=sha256:5a0f54ce2c092aaf439813735584b9537cad479575a09892b8352fea5e988dc0 \ - --hash=sha256:5a9a0d155deafd9448baff28c08e150d9b24ff010e899311ddd63c45c2445e28 \ - --hash=sha256:5b02d65b9ccf0ef6c34cba6cf5bf2aab1bb2f49c6090bafeecc9cd81ad4ea1c1 \ - --hash=sha256:60db23fa423575eeb65ea430cee741acb7c26a1365d103f7b0f6ec412b893853 \ - --hash=sha256:642c2e7a804fcf18c222e1060df25fc210b9c58db7c91416fb055897fc27e8cc \ - --hash=sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf \ - --hash=sha256:6a9a25751acb379b466ff6be78a315e2b439d4c94c1e99cb7266d40a537995d3 \ - --hash=sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3 \ - --hash=sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164 \ - --hash=sha256:6e743de5e9c3d1b7185870f480587b75b1cb604832e380d64f9504a0535912d1 \ - 
--hash=sha256:709fe01086a55cf79d20f741f39325018f4df051ef39fe921b1ebe780a66184c \ - --hash=sha256:7b7c050ae976e286906dd3f26009e117eb000fb2cf3533398c5ad9ccc86867b1 \ - --hash=sha256:7d2872609603cb35ca513d7404a94d6d608fc13211563571117046c9d2bcc3d7 \ - --hash=sha256:7ef58fb89674095bfc57c4069e95d7a31cfdc0939e2a579882ac7d55aadfd2a1 \ - --hash=sha256:80bb5c256f1415f747011dc3604b59bc1f91c6e7150bd7db03b19170ee06b320 \ - --hash=sha256:81b19725065dcb43df02b37e03278c011a09e49757287dca60c5aecdd5a0b8ed \ - --hash=sha256:833b58d5d0b7e5b9832869f039203389ac7cbf01765639c7309fd50ef619e0b1 \ - --hash=sha256:88bd7b6bd70a5b6803c1abf6bca012f7ed963e58c68d76ee20b9d751c74a3248 \ - --hash=sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c \ - --hash=sha256:8c0ce1e99116d5ab21355d8ebe53d9460366704ea38ae4d9f6933188f327b456 \ - --hash=sha256:8d649d616e5c6a678b26d15ece345354f7c2286acd6db868e65fcc5ff7c24a77 \ - --hash=sha256:903500616422a40a98a5a3c4ff4ed9d0066f3b4c951fa286018ecdf0750194ef \ - --hash=sha256:9736af4641846491aedb3c3f56b9bc5568d92b0692303b5a305301a95dfd38b1 \ - --hash=sha256:988635d122aaf2bdcef9e795435662bcd65b02f4f4c1ae37fbee7401c440b3a7 \ - --hash=sha256:9cca3c2cdadb362116235fdbd411735de4328c61425b0aa9f872fd76d02c4e86 \ - --hash=sha256:9e0fd32e0148dd5dea6af5fee42beb949098564cc23211a88d799e434255a1f4 \ - --hash=sha256:9f3e6f9e05148ff90002b884fbc2a86bd303ae847e472f44ecc06c2cd2fcdb2d \ - --hash=sha256:a85d2b46be66a71bedde836d9e41859879cc54a2a04fad1191eb50c2066f6e9d \ - --hash=sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8 \ - --hash=sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8 \ - --hash=sha256:aa31fdcc33fef9eb2552cbcbfee7773d5a6792c137b359e82879c101e98584c5 \ - --hash=sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a \ - --hash=sha256:b014c23646a467558be7da3d6b9fa409b2c567d2110599b7cf9a0c5992b3b471 \ - --hash=sha256:b21bb4c09ffabfa0e85e3a6b623e19b80e7acd709b9f91452b8297ace2a8ab00 \ - 
--hash=sha256:b5901a312f4d14c59918c221323068fad0540e34324925c8475263841dbdfe68 \ - --hash=sha256:b9b7a708dd92306328117d8c4b62e2194d00c365f18eff11a9b53c6f923b01e3 \ - --hash=sha256:d1967f46ea8f2db647c786e78d8cc7e4313dbd1b0aca360592d8027b8508e24d \ - --hash=sha256:d52a25136894c63de15a35bc0bdc5adb4b0e173b9c0d07a2be9d3ca64a332735 \ - --hash=sha256:d77c85fedff92cf788face9bfa3ebaa364448ebb1d765302e9af11bf449ca36d \ - --hash=sha256:d79d7d5dc8a32b7093e81e97dad755127ff77bcc899e845f41bf71747af0c569 \ - --hash=sha256:dbcda74c67263139358f4d188ae5faae95c30929281bc6866d00573783c422b7 \ - --hash=sha256:ddaea91abf8b0d13443f6dac52e89051a5063c7d014710dcb4d4abb2ff811a59 \ - --hash=sha256:dee0ce50c6a2dd9056c20db781e9c1cfd33e77d2d569f5d1d9321c641bb903d5 \ - --hash=sha256:dee60e1de1898bde3b238f18340eec6148986da0455d8ba7848d50470a7a32fb \ - --hash=sha256:e2f83e18fe2f4c9e7db597e988f72712c0c3676d337d8b101f6758107c42425b \ - --hash=sha256:e3fb1677c720409d5f671e39bac6c9e0e422584e5f518bfd50aa4cbbea02433f \ - --hash=sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55 \ - --hash=sha256:ee2b1b1769f6707a8a445162ea16dddf74285c3964f605877a20e38545c3c462 \ - --hash=sha256:ee6acae74a2b91865910eef5e7de37dc6895ad96fa23603d1d27ea69df545015 \ - --hash=sha256:ef3f72c9666bba2bab70d2a8b79f2c6d2c1a42a7f7e2b0ec83bb2f9e383950af +wrapt==1.16.0 \ + --hash=sha256:0d2691979e93d06a95a26257adb7bfd0c93818e89b1406f5a28f36e0d8c1e1fc \ + --hash=sha256:14d7dc606219cdd7405133c713f2c218d4252f2a469003f8c46bb92d5d095d81 \ + --hash=sha256:1a5db485fe2de4403f13fafdc231b0dbae5eca4359232d2efc79025527375b09 \ + --hash=sha256:1acd723ee2a8826f3d53910255643e33673e1d11db84ce5880675954183ec47e \ + --hash=sha256:1ca9b6085e4f866bd584fb135a041bfc32cab916e69f714a7d1d397f8c4891ca \ + --hash=sha256:1dd50a2696ff89f57bd8847647a1c363b687d3d796dc30d4dd4a9d1689a706f0 \ + --hash=sha256:2076fad65c6736184e77d7d4729b63a6d1ae0b70da4868adeec40989858eb3fb \ + 
--hash=sha256:2a88e6010048489cda82b1326889ec075a8c856c2e6a256072b28eaee3ccf487 \ + --hash=sha256:3ebf019be5c09d400cf7b024aa52b1f3aeebeff51550d007e92c3c1c4afc2a40 \ + --hash=sha256:418abb18146475c310d7a6dc71143d6f7adec5b004ac9ce08dc7a34e2babdc5c \ + --hash=sha256:43aa59eadec7890d9958748db829df269f0368521ba6dc68cc172d5d03ed8060 \ + --hash=sha256:44a2754372e32ab315734c6c73b24351d06e77ffff6ae27d2ecf14cf3d229202 \ + --hash=sha256:490b0ee15c1a55be9c1bd8609b8cecd60e325f0575fc98f50058eae366e01f41 \ + --hash=sha256:49aac49dc4782cb04f58986e81ea0b4768e4ff197b57324dcbd7699c5dfb40b9 \ + --hash=sha256:5eb404d89131ec9b4f748fa5cfb5346802e5ee8836f57d516576e61f304f3b7b \ + --hash=sha256:5f15814a33e42b04e3de432e573aa557f9f0f56458745c2074952f564c50e664 \ + --hash=sha256:5f370f952971e7d17c7d1ead40e49f32345a7f7a5373571ef44d800d06b1899d \ + --hash=sha256:66027d667efe95cc4fa945af59f92c5a02c6f5bb6012bff9e60542c74c75c362 \ + --hash=sha256:66dfbaa7cfa3eb707bbfcd46dab2bc6207b005cbc9caa2199bcbc81d95071a00 \ + --hash=sha256:685f568fa5e627e93f3b52fda002c7ed2fa1800b50ce51f6ed1d572d8ab3e7fc \ + --hash=sha256:6906c4100a8fcbf2fa735f6059214bb13b97f75b1a61777fcf6432121ef12ef1 \ + --hash=sha256:6a42cd0cfa8ffc1915aef79cb4284f6383d8a3e9dcca70c445dcfdd639d51267 \ + --hash=sha256:6dcfcffe73710be01d90cae08c3e548d90932d37b39ef83969ae135d36ef3956 \ + --hash=sha256:6f6eac2360f2d543cc875a0e5efd413b6cbd483cb3ad7ebf888884a6e0d2e966 \ + --hash=sha256:72554a23c78a8e7aa02abbd699d129eead8b147a23c56e08d08dfc29cfdddca1 \ + --hash=sha256:73870c364c11f03ed072dda68ff7aea6d2a3a5c3fe250d917a429c7432e15228 \ + --hash=sha256:73aa7d98215d39b8455f103de64391cb79dfcad601701a3aa0dddacf74911d72 \ + --hash=sha256:75ea7d0ee2a15733684badb16de6794894ed9c55aa5e9903260922f0482e687d \ + --hash=sha256:7bd2d7ff69a2cac767fbf7a2b206add2e9a210e57947dd7ce03e25d03d2de292 \ + --hash=sha256:807cc8543a477ab7422f1120a217054f958a66ef7314f76dd9e77d3f02cdccd0 \ + --hash=sha256:8e9723528b9f787dc59168369e42ae1c3b0d3fadb2f1a71de14531d321ee05b0 \ + 
--hash=sha256:9090c9e676d5236a6948330e83cb89969f433b1943a558968f659ead07cb3b36 \ + --hash=sha256:9153ed35fc5e4fa3b2fe97bddaa7cbec0ed22412b85bcdaf54aeba92ea37428c \ + --hash=sha256:9159485323798c8dc530a224bd3ffcf76659319ccc7bbd52e01e73bd0241a0c5 \ + --hash=sha256:941988b89b4fd6b41c3f0bfb20e92bd23746579736b7343283297c4c8cbae68f \ + --hash=sha256:94265b00870aa407bd0cbcfd536f17ecde43b94fb8d228560a1e9d3041462d73 \ + --hash=sha256:98b5e1f498a8ca1858a1cdbffb023bfd954da4e3fa2c0cb5853d40014557248b \ + --hash=sha256:9b201ae332c3637a42f02d1045e1d0cccfdc41f1f2f801dafbaa7e9b4797bfc2 \ + --hash=sha256:a0ea261ce52b5952bf669684a251a66df239ec6d441ccb59ec7afa882265d593 \ + --hash=sha256:a33a747400b94b6d6b8a165e4480264a64a78c8a4c734b62136062e9a248dd39 \ + --hash=sha256:a452f9ca3e3267cd4d0fcf2edd0d035b1934ac2bd7e0e57ac91ad6b95c0c6389 \ + --hash=sha256:a86373cf37cd7764f2201b76496aba58a52e76dedfaa698ef9e9688bfd9e41cf \ + --hash=sha256:ac83a914ebaf589b69f7d0a1277602ff494e21f4c2f743313414378f8f50a4cf \ + --hash=sha256:aefbc4cb0a54f91af643660a0a150ce2c090d3652cf4052a5397fb2de549cd89 \ + --hash=sha256:b3646eefa23daeba62643a58aac816945cadc0afaf21800a1421eeba5f6cfb9c \ + --hash=sha256:b47cfad9e9bbbed2339081f4e346c93ecd7ab504299403320bf85f7f85c7d46c \ + --hash=sha256:b935ae30c6e7400022b50f8d359c03ed233d45b725cfdd299462f41ee5ffba6f \ + --hash=sha256:bb2dee3874a500de01c93d5c71415fcaef1d858370d405824783e7a8ef5db440 \ + --hash=sha256:bc57efac2da352a51cc4658878a68d2b1b67dbe9d33c36cb826ca449d80a8465 \ + --hash=sha256:bf5703fdeb350e36885f2875d853ce13172ae281c56e509f4e6eca049bdfb136 \ + --hash=sha256:c31f72b1b6624c9d863fc095da460802f43a7c6868c5dda140f51da24fd47d7b \ + --hash=sha256:c5cd603b575ebceca7da5a3a251e69561bec509e0b46e4993e1cac402b7247b8 \ + --hash=sha256:d2efee35b4b0a347e0d99d28e884dfd82797852d62fcd7ebdeee26f3ceb72cf3 \ + --hash=sha256:d462f28826f4657968ae51d2181a074dfe03c200d6131690b7d65d55b0f360f8 \ + --hash=sha256:d5e49454f19ef621089e204f862388d29e6e8d8b162efce05208913dde5b9ad6 \ + 
--hash=sha256:da4813f751142436b075ed7aa012a8778aa43a99f7b36afe9b742d3ed8bdc95e \ + --hash=sha256:db2e408d983b0e61e238cf579c09ef7020560441906ca990fe8412153e3b291f \ + --hash=sha256:db98ad84a55eb09b3c32a96c576476777e87c520a34e2519d3e59c44710c002c \ + --hash=sha256:dbed418ba5c3dce92619656802cc5355cb679e58d0d89b50f116e4a9d5a9603e \ + --hash=sha256:dcdba5c86e368442528f7060039eda390cc4091bfd1dca41e8046af7c910dda8 \ + --hash=sha256:decbfa2f618fa8ed81c95ee18a387ff973143c656ef800c9f24fb7e9c16054e2 \ + --hash=sha256:e4fdb9275308292e880dcbeb12546df7f3e0f96c6b41197e0cf37d2826359020 \ + --hash=sha256:eb1b046be06b0fce7249f1d025cd359b4b80fc1c3e24ad9eca33e0dcdb2e4a35 \ + --hash=sha256:eb6e651000a19c96f452c85132811d25e9264d836951022d6e81df2fff38337d \ + --hash=sha256:ed867c42c268f876097248e05b6117a65bcd1e63b779e916fe2e33cd6fd0d3c3 \ + --hash=sha256:edfad1d29c73f9b863ebe7082ae9321374ccb10879eeabc84ba3b69f2579d537 \ + --hash=sha256:f2058f813d4f2b5e3a9eb2eb3faf8f1d99b81c3e51aeda4b168406443e8ba809 \ + --hash=sha256:f6b2d0c6703c988d334f297aa5df18c45e97b0af3679bb75059e0e0bd8b1069d \ + --hash=sha256:f8212564d49c50eb4565e502814f694e240c55551a5f1bc841d4fcaabb0a9b8a \ + --hash=sha256:ffa565331890b90056c01db69c0fe634a776f8019c143a5ae265f9c6bc4bd6d4 # via -r requirements.in zipp==3.17.0 \ --hash=sha256:0e923e726174922dce09c53c59ad483ff7bbb8e572e00c7f7c46b88556409f31 \ diff --git a/tensorflow/BUILD b/tensorflow/BUILD index acc8468d6168e4..9eb036f01e0614 100644 --- a/tensorflow/BUILD +++ b/tensorflow/BUILD @@ -44,7 +44,7 @@ load("@bazel_skylib//:bzl_library.bzl", "bzl_library") # # buildifier: disable=out-of-order-load # load("//devtools/build_cleaner/skylark:action_config_test.bzl", "action_config_test") # load("//devtools/copybara/rules:copybara.bzl", "copybara_config_test") -# load("//tools/build_defs/license:license.bzl", "license") +# load("@rules_license//rules:license.bzl", "license") # # buildifier: enable=out-of-order-load # copybara:uncomment_end @@ -1631,6 +1631,7 @@ genrule( 
d="$${d#*external/farmhash_archive/src}" d="$${d#*external/$${extname}/}" + d="$${d#_virtual_includes/*/}" fi mkdir -p "$@/$${d}" diff --git a/tensorflow/c/eager/abstract_tensor_handle.cc b/tensorflow/c/eager/abstract_tensor_handle.cc index 8a4438e2b9e75a..e04a9810638f61 100644 --- a/tensorflow/c/eager/abstract_tensor_handle.cc +++ b/tensorflow/c/eager/abstract_tensor_handle.cc @@ -34,7 +34,7 @@ std::string AbstractTensorHandle::DebugString() const { Status AbstractTensorHandle::TensorHandleStatus() const { // Tensor handles in current runtime don't carry error info and this method // should always return OK status. - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/c/eager/c_api.cc b/tensorflow/c/eager/c_api.cc index 29301e4e37f754..2d79853d7988ec 100644 --- a/tensorflow/c/eager/c_api.cc +++ b/tensorflow/c/eager/c_api.cc @@ -555,7 +555,7 @@ class CAPICustomDeviceTensorHandle } summary = std::string(reinterpret_cast(summary_buffer->data), summary_buffer->length); - return OkStatus(); + return absl::OkStatus(); } private: diff --git a/tensorflow/c/eager/c_api_distributed_test.cc b/tensorflow/c/eager/c_api_distributed_test.cc index 13b688889a4567..3cb7a5d0fa5f1a 100644 --- a/tensorflow/c/eager/c_api_distributed_test.cc +++ b/tensorflow/c/eager/c_api_distributed_test.cc @@ -315,11 +315,11 @@ class GraphErrorInjectionPass : public tensorflow::GraphOptimizationPass { tensorflow::Status Run( const tensorflow::GraphOptimizationPassOptions& options) override { if (!enabled_) { - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } if (first_call_) { first_call_ = false; - return ::tensorflow::OkStatus(); + return absl::OkStatus(); } return tensorflow::errors::Internal("Graph pass runs for more than once!"); } @@ -447,7 +447,7 @@ class FunctionErrorInjectionPass : public tensorflow::FunctionOptimizationPass { return tensorflow::errors::Internal("Injected graph pass error."); } } - return ::tensorflow::OkStatus(); + 
return absl::OkStatus(); } private: diff --git a/tensorflow/c/eager/c_api_experimental.cc b/tensorflow/c/eager/c_api_experimental.cc index d52e938c047a6d..c2f8125ddbb76a 100644 --- a/tensorflow/c/eager/c_api_experimental.cc +++ b/tensorflow/c/eager/c_api_experimental.cc @@ -606,7 +606,7 @@ void TFE_OpSetCancellationManager(TFE_Op* op, TF_Status* status) { tensorflow::unwrap(op)->SetCancellationManager( tensorflow::unwrap(cancellation_manager)); - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } TFE_Executor* TFE_NewExecutor(bool is_async, bool enable_streaming_enqueue, @@ -667,7 +667,7 @@ void TFE_ContextGetFunctionDef(TFE_Context* ctx, const char* function_name, buf->data_deallocator = [](void* data, size_t length) { tensorflow::port::Free(data); }; - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } void TFE_ContextGetGraphDebugInfo(TFE_Context* ctx, const char* function_name, @@ -691,7 +691,7 @@ void TFE_ContextGetGraphDebugInfo(TFE_Context* ctx, const char* function_name, buf->data_deallocator = [](void* data, size_t length) { tensorflow::port::Free(data); }; - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } TF_Tensor* TFE_AllocateHostTensor(TFE_Context* ctx, TF_DataType dtype, @@ -817,7 +817,7 @@ void TFE_GetExecutedOpNames(TFE_Context* ctx, TF_Buffer* buf, buf->data_deallocator = [](void* data, size_t length) { tensorflow::port::Free(data); }; - status->status = ::tensorflow::OkStatus(); + status->status = absl::OkStatus(); } void TFE_SetLogicalCpuDevices(TFE_Context* ctx, int num_cpus, @@ -960,7 +960,7 @@ void TFE_GetTaskStates(TFE_Context* ctx, const TF_Buffer& tasks, void* states, *state_iter = std::move(s); ++state_iter; } - status->status = tensorflow::OkStatus(); + status->status = absl::OkStatus(); } void TFE_WaitAtBarrier(TFE_Context* ctx, const char* barrier_id, diff --git a/tensorflow/c/eager/c_api_unified_experimental.cc 
b/tensorflow/c/eager/c_api_unified_experimental.cc index 53f340ee2aa450..8422459c21b529 100644 --- a/tensorflow/c/eager/c_api_unified_experimental.cc +++ b/tensorflow/c/eager/c_api_unified_experimental.cc @@ -52,7 +52,7 @@ Status SetDefaultTracingEngine(const char* name) { auto entry = GetFactories().find(name); if (entry != GetFactories().end()) { default_factory = GetFactories().find(name)->second; - return OkStatus(); + return absl::OkStatus(); } string msg = absl::StrCat( "No tracing engine factory has been registered with the key '", name, diff --git a/tensorflow/c/eager/c_api_unified_experimental_graph.cc b/tensorflow/c/eager/c_api_unified_experimental_graph.cc index 5e804dca267a0d..0c9d4830850bb7 100644 --- a/tensorflow/c/eager/c_api_unified_experimental_graph.cc +++ b/tensorflow/c/eager/c_api_unified_experimental_graph.cc @@ -71,7 +71,7 @@ class GraphTensor : public TracingTensorHandle { DCHECK_GE(num_dims, -1); TF_RETURN_IF_ERROR(StatusFromTF_Status(&status)); if (num_dims == kUnknownRank) { - return OkStatus(); + return absl::OkStatus(); } std::vector dims(num_dims, kUnknownDim); @@ -81,7 +81,7 @@ class GraphTensor : public TracingTensorHandle { TF_RETURN_IF_ERROR(StatusFromTF_Status(&status)); TF_RETURN_IF_ERROR(tensorflow::TensorShapeUtils::MakeShape(dims, shape)); - return OkStatus(); + return absl::OkStatus(); } tensorflow::FullTypeDef FullType() const override { @@ -119,7 +119,7 @@ class GraphOperation : public TracingOperation { device_name_ = raw_device_name; } op_type_ = op; - return OkStatus(); + return absl::OkStatus(); } Status SetOpName(const char* const op_name) override { if (op_) { @@ -135,7 +135,7 @@ class GraphOperation : public TracingOperation { mutex_lock l(g_->mu); op_.reset(new TF_OperationDescription(g_, op_type_.c_str(), g_->graph.NewName(op_name).c_str())); - return OkStatus(); + return absl::OkStatus(); } const string& Name() const override { return op_type_; } const string& DeviceName() const override { return device_name_; } @@ 
-143,7 +143,7 @@ class GraphOperation : public TracingOperation { Status SetDeviceName(const char* name) override { // TODO(srbs): Implement this. device_name_ = name; - return OkStatus(); + return absl::OkStatus(); } Status AddInput(AbstractTensorHandle* input) override { @@ -153,7 +153,7 @@ class GraphOperation : public TracingOperation { "Unable to cast input to GraphTensor"); } TF_AddInput(op_.get(), t->output_); - return OkStatus(); + return absl::OkStatus(); } Status AddInputList(absl::Span inputs) override { std::vector tf_outputs(inputs.size()); @@ -166,7 +166,7 @@ class GraphOperation : public TracingOperation { tf_outputs[i] = t->output_; } TF_AddInputList(op_.get(), tf_outputs.data(), tf_outputs.size()); - return OkStatus(); + return absl::OkStatus(); } Status Execute(absl::Span retvals, int* num_retvals) override { @@ -182,26 +182,26 @@ class GraphOperation : public TracingOperation { for (int i = 0; i < *num_retvals; ++i) { retvals[i] = new GraphTensor({operation, i}, g_); } - return OkStatus(); + return absl::OkStatus(); } Status SetAttrString(const char* attr_name, const char* data, size_t length) override { tensorflow::StringPiece s(data, length); op_->node_builder.Attr(attr_name, s); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrInt(const char* attr_name, int64_t value) override { op_->node_builder.Attr(attr_name, static_cast(value)); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrFloat(const char* attr_name, float value) override { op_->node_builder.Attr(attr_name, value); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrBool(const char* attr_name, bool value) override { op_->node_builder.Attr(attr_name, value); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrType(const char* const attr_name, DataType value) override { if (!op_) { @@ -210,7 +210,7 @@ class GraphOperation : public TracingOperation { "op_type and op_name must be specified before specifying attrs."); } 
op_->node_builder.Attr(attr_name, value); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrShape(const char* attr_name, const int64_t* dims, const int num_dims) override { @@ -220,7 +220,7 @@ class GraphOperation : public TracingOperation { reinterpret_cast(dims), num_dims)); } op_->node_builder.Attr(attr_name, shape); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrFunction(const char* attr_name, const AbstractOperation* value) override { @@ -232,7 +232,7 @@ class GraphOperation : public TracingOperation { tensorflow::NameAttrList func_name; func_name.set_name(string(value, value + length)); op_->node_builder.Attr(attr_name, func_name); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrTensor(const char* attr_name, AbstractTensorInterface* tensor) override { @@ -255,26 +255,26 @@ class GraphOperation : public TracingOperation { } op_->node_builder.Attr(attr_name, v); } - return OkStatus(); + return absl::OkStatus(); } Status SetAttrFloatList(const char* attr_name, const float* values, int num_values) override { op_->node_builder.Attr(attr_name, ArraySlice(values, num_values)); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrIntList(const char* attr_name, const int64_t* values, int num_values) override { op_->node_builder.Attr( attr_name, ArraySlice( reinterpret_cast(values), num_values)); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrTypeList(const char* attr_name, const DataType* values, int num_values) override { op_->node_builder.Attr(attr_name, ArraySlice(values, num_values)); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrBoolList(const char* attr_name, const unsigned char* values, int num_values) override { @@ -285,7 +285,7 @@ class GraphOperation : public TracingOperation { op_->node_builder.Attr(attr_name, ArraySlice(b.get(), num_values)); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrShapeList(const char* attr_name, const int64_t** dims, const 
int* num_dims, int num_values) override { @@ -300,7 +300,7 @@ class GraphOperation : public TracingOperation { } } op_->node_builder.Attr(attr_name, shapes); - return OkStatus(); + return absl::OkStatus(); } Status SetAttrFunctionList( const char* attr_name, @@ -368,7 +368,7 @@ class GraphContext : public TracingContext { } inputs_.push_back(t->output_); *output = tensorflow::down_cast(outputs[0]); - return OkStatus(); + return absl::OkStatus(); } Status Finalize(OutputList* outputs, AbstractFunction** f) override { @@ -393,7 +393,7 @@ class GraphContext : public TracingContext { TF_DeleteFunction(func); TF_RETURN_IF_ERROR(StatusFromTF_Status(s)); TF_DeleteStatus(s); - return OkStatus(); + return absl::OkStatus(); } Status RegisterFunction(AbstractFunction* func) override { diff --git a/tensorflow/c/eager/dlpack.cc b/tensorflow/c/eager/dlpack.cc index ca033cd2266b01..f8de31aadbaa6f 100644 --- a/tensorflow/c/eager/dlpack.cc +++ b/tensorflow/c/eager/dlpack.cc @@ -169,57 +169,57 @@ Status TfDataTypeFormDlDataType(const DLDataType& dtype, "Only DLPack bools of bitwidth 8 are supported, got: ", dtype.bits); } *tf_dtype = TF_DataType::TF_BOOL; - return OkStatus(); + return absl::OkStatus(); case DLDataTypeCode::kDLUInt: switch (dtype.bits) { case 8: *tf_dtype = TF_DataType::TF_UINT8; - return OkStatus(); + return absl::OkStatus(); case 16: *tf_dtype = TF_DataType::TF_UINT16; - return OkStatus(); + return absl::OkStatus(); case 32: *tf_dtype = TF_DataType::TF_UINT32; - return OkStatus(); + return absl::OkStatus(); case 64: *tf_dtype = TF_DataType::TF_UINT64; - return OkStatus(); + return absl::OkStatus(); default: return tensorflow::errors::InvalidArgument("Unsupported UInt bits: ", dtype.bits); } - return OkStatus(); + return absl::OkStatus(); case DLDataTypeCode::kDLInt: switch (dtype.bits) { case 8: *tf_dtype = TF_DataType::TF_INT8; - return OkStatus(); + return absl::OkStatus(); case 16: *tf_dtype = TF_DataType::TF_INT16; - return OkStatus(); + return 
absl::OkStatus(); case 32: *tf_dtype = TF_DataType::TF_INT32; - return OkStatus(); + return absl::OkStatus(); case 64: *tf_dtype = TF_DataType::TF_INT64; - return OkStatus(); + return absl::OkStatus(); default: return tensorflow::errors::InvalidArgument("Unsupported Int bits: ", dtype.bits); } - return OkStatus(); + return absl::OkStatus(); case DLDataTypeCode::kDLFloat: switch (dtype.bits) { case 16: *tf_dtype = TF_DataType::TF_HALF; - return OkStatus(); + return absl::OkStatus(); case 32: *tf_dtype = TF_DataType::TF_FLOAT; - return OkStatus(); + return absl::OkStatus(); case 64: *tf_dtype = TF_DataType::TF_DOUBLE; - return OkStatus(); + return absl::OkStatus(); default: return tensorflow::errors::InvalidArgument("Unsupported Float bits: ", dtype.bits); @@ -229,7 +229,7 @@ Status TfDataTypeFormDlDataType(const DLDataType& dtype, switch (dtype.bits) { case 16: *tf_dtype = TF_DataType::TF_BFLOAT16; - return OkStatus(); + return absl::OkStatus(); default: return tensorflow::errors::InvalidArgument( "Unsupported BFloat bits: ", dtype.bits); @@ -239,10 +239,10 @@ Status TfDataTypeFormDlDataType(const DLDataType& dtype, switch (dtype.bits) { case 64: *tf_dtype = TF_DataType::TF_COMPLEX64; - return OkStatus(); + return absl::OkStatus(); case 128: *tf_dtype = TF_DataType::TF_COMPLEX128; - return OkStatus(); + return absl::OkStatus(); default: return tensorflow::errors::InvalidArgument( "Unsupported Complex bits: ", dtype.bits); diff --git a/tensorflow/c/eager/gradient_checker.cc b/tensorflow/c/eager/gradient_checker.cc index 2042c857e8b211..2fcaee07b37f50 100644 --- a/tensorflow/c/eager/gradient_checker.cc +++ b/tensorflow/c/eager/gradient_checker.cc @@ -65,7 +65,7 @@ Status RunAndMaybeSum(AbstractContext* ctx, Model forward, // If the output is a scalar, then return the scalar output if (num_dims_out == 0) { outputs[0] = model_out.release(); - return OkStatus(); + return absl::OkStatus(); } // Else, reduce sum the output to get a scalar @@ -85,7 +85,7 @@ Status 
RunAndMaybeSum(AbstractContext* ctx, Model forward, // Reduce sum the output on all dimensions. TF_RETURN_IF_ERROR(ops::Sum(ctx, model_out.get(), sum_dims.get(), &outputs[0], /*keep_dims=*/false, "sum_output")); - return OkStatus(); + return absl::OkStatus(); } // ========================= End Helper Functions============================== @@ -198,7 +198,7 @@ Status CalcNumericalGrad(AbstractContext* ctx, Model forward, TF_RETURN_IF_ERROR(TestTensorHandleWithDims( ctx, dtheta_approx.data(), theta_dims.data(), num_dims, numerical_grad)); TF_DeleteTensor(theta_tensor); - return OkStatus(); + return absl::OkStatus(); } } // namespace gradients diff --git a/tensorflow/c/eager/gradients.cc b/tensorflow/c/eager/gradients.cc index 6f4ab3016beb63..326a9e8cb829d4 100644 --- a/tensorflow/c/eager/gradients.cc +++ b/tensorflow/c/eager/gradients.cc @@ -47,7 +47,7 @@ Status ZerosLike(AbstractContext* ctx, AbstractTensorHandle* t, TF_RETURN_IF_ERROR( op->Execute(absl::Span(outputs), &num_outputs)); *result = outputs[0]; - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -59,7 +59,7 @@ Status GradientRegistry::Register( return errors::AlreadyExists(error_msg); } registry_.insert({op_name, gradient_function_factory}); - return OkStatus(); + return absl::OkStatus(); } Status GradientRegistry::Lookup( const ForwardOperation& op, @@ -70,7 +70,7 @@ Status GradientRegistry::Lookup( return errors::NotFound(error_msg); } gradient_function->reset(iter->second(op)); - return OkStatus(); + return absl::OkStatus(); } TapeTensor::TapeTensor(AbstractTensorHandle* handle) : handle_(handle) { @@ -200,7 +200,7 @@ Status TapeVSpace::BuildOnesLike(const TapeTensor& t, TF_RETURN_IF_ERROR( op->Execute(absl::Span(outputs), &num_outputs)); *result = outputs[0]; - return OkStatus(); + return absl::OkStatus(); } // Looks up the ID of a Gradient. 
@@ -292,7 +292,7 @@ Status Tape::ComputeGradient( TF_RETURN_IF_ERROR(GradientTape::ComputeGradient( vspace, target_tensor_ids, source_tensor_ids, sources_that_are_targets, output_gradients, result, /*build_default_zeros_grads*/ false)); - return OkStatus(); + return absl::OkStatus(); } // Helper functions which delegate to `AbstractOperation`, update @@ -309,7 +309,7 @@ Status AddInput(AbstractOperation* op_, AbstractTensorHandle* input, ForwardOperation* forward_op_) { TF_RETURN_IF_ERROR(op_->AddInput(input)); forward_op_->inputs.push_back(input); - return OkStatus(); + return absl::OkStatus(); } Status AddInputList(AbstractOperation* op_, absl::Span inputs, @@ -318,7 +318,7 @@ Status AddInputList(AbstractOperation* op_, for (auto input : inputs) { forward_op_->inputs.push_back(input); } - return OkStatus(); + return absl::OkStatus(); } Status SetAttrString(AbstractOperation* op_, const char* attr_name, @@ -482,7 +482,7 @@ Status Execute(AbstractOperation* op_, AbstractContext* ctx, TF_RETURN_IF_ERROR(registry.Lookup(*forward_op_, &gradient_fn)); tape->RecordOperation(forward_op_->inputs, retvals, gradient_fn.release(), op_->Name()); - return OkStatus(); + return absl::OkStatus(); } } // namespace internal diff --git a/tensorflow/c/eager/gradients_test.cc b/tensorflow/c/eager/gradients_test.cc index a345240e8c3e4f..9df16f10290d0b 100644 --- a/tensorflow/c/eager/gradients_test.cc +++ b/tensorflow/c/eager/gradients_test.cc @@ -59,7 +59,7 @@ class CppGradients Status RegisterGradients(GradientRegistry* registry) { TF_RETURN_IF_ERROR(RegisterNotDifferentiable(registry, "CheckNumerics")); - return OkStatus(); + return absl::OkStatus(); } TEST_P(CppGradients, TestSetAttrString) { diff --git a/tensorflow/c/eager/graph_function.cc b/tensorflow/c/eager/graph_function.cc index 3f4430bb614ea1..bf45feb34afb0f 100644 --- a/tensorflow/c/eager/graph_function.cc +++ b/tensorflow/c/eager/graph_function.cc @@ -22,7 +22,7 @@ GraphFunction::GraphFunction(FunctionDef fdef) 
GraphFunction::~GraphFunction() {} Status GraphFunction::GetFunctionDef(FunctionDef** fdef) { *fdef = &fdef_; - return OkStatus(); + return absl::OkStatus(); } } // namespace graph } // namespace tracing diff --git a/tensorflow/c/eager/immediate_execution_tensor_handle.cc b/tensorflow/c/eager/immediate_execution_tensor_handle.cc index d8cb9e165495c1..c99a270f0cb804 100644 --- a/tensorflow/c/eager/immediate_execution_tensor_handle.cc +++ b/tensorflow/c/eager/immediate_execution_tensor_handle.cc @@ -55,7 +55,7 @@ Status ImmediateExecutionTensorHandle::SummarizeValue( return status; } summary = resolved->SummarizeValue(); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc index 0522ad3b73072f..e5b1ee97a2e802 100644 --- a/tensorflow/c/eager/parallel_device/parallel_device_lib.cc +++ b/tensorflow/c/eager/parallel_device/parallel_device_lib.cc @@ -604,7 +604,7 @@ Status ParallelTensor::Shape(const std::vector** shape) const { shape_ = std::vector(dim_sizes.begin(), dim_sizes.end()); } *shape = &*shape_; - return OkStatus(); + return absl::OkStatus(); } Status ParallelTensor::SummarizeValue(std::string& summary) { @@ -624,7 +624,7 @@ Status ParallelTensor::SummarizeValue(std::string& summary) { "\": ", component_summary); } summary += "}"; - return OkStatus(); + return absl::OkStatus(); } } // namespace parallel_device diff --git a/tensorflow/c/experimental/stream_executor/BUILD b/tensorflow/c/experimental/stream_executor/BUILD index 48bdcb2c9a26bf..c0b62760cd4207 100644 --- a/tensorflow/c/experimental/stream_executor/BUILD +++ b/tensorflow/c/experimental/stream_executor/BUILD @@ -80,6 +80,7 @@ tf_cc_test( "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/protobuf:error_codes_proto_impl_cc", + "@local_tsl//tsl/platform:statusor", "@local_xla//xla/stream_executor", ], ) diff --git 
a/tensorflow/c/experimental/stream_executor/stream_executor.cc b/tensorflow/c/experimental/stream_executor/stream_executor.cc index 12391143a4d9e0..e20cfcfd83a205 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor.cc @@ -573,11 +573,6 @@ class CStreamExecutor : public internal::StreamExecutorInterface { return std::unique_ptr( new CEvent(&device_, stream_executor_)); } - std::unique_ptr CreateKernelImplementation() - override { - LOG(FATAL) - << "CreateKernelImplementation is not supported by pluggable device."; - } std::unique_ptr GetStreamImplementation() override { return std::unique_ptr( diff --git a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc index 0f3e2e76aa4ebe..3542586ffa8e4e 100644 --- a/tensorflow/c/experimental/stream_executor/stream_executor_test.cc +++ b/tensorflow/c/experimental/stream_executor/stream_executor_test.cc @@ -26,6 +26,7 @@ limitations under the License. 
#include "tensorflow/core/lib/core/status_test_util.h" #include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/error_codes.pb.h" +#include "tsl/platform/statusor.h" namespace stream_executor { namespace { @@ -200,11 +201,11 @@ TEST_F(StreamExecutorTest, HostMemoryAllocate) { }; StreamExecutor* executor = GetExecutor(0); ASSERT_FALSE(allocate_called); - void* mem = executor->HostMemoryAllocate(8); - ASSERT_NE(mem, nullptr); + TF_ASSERT_OK_AND_ASSIGN(auto mem, executor->HostMemoryAllocate(8)); + ASSERT_NE(mem->opaque(), nullptr); ASSERT_TRUE(allocate_called); ASSERT_FALSE(deallocate_called); - executor->HostMemoryDeallocate(mem); + mem.reset(); ASSERT_TRUE(deallocate_called); } diff --git a/tensorflow/cc/framework/fuzzing/BUILD b/tensorflow/cc/framework/fuzzing/BUILD index 772bff55a9251c..ec424fc0425630 100644 --- a/tensorflow/cc/framework/fuzzing/BUILD +++ b/tensorflow/cc/framework/fuzzing/BUILD @@ -29,7 +29,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:hash", "@com_google_absl//absl/container:flat_hash_map", - "@com_google_absl//absl/status", + "@com_google_absl//absl/log", "@com_google_absl//absl/strings", "@local_tsl//tsl/platform:status", ], diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc index 416bb56e820359..cacc15ca32d28f 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.cc @@ -26,14 +26,19 @@ limitations under the License. 
#include #include "absl/container/flat_hash_map.h" +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" #include "absl/strings/str_join.h" #include "absl/strings/substitute.h" #include "tensorflow/cc/framework/cc_op_gen_util.h" +#include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/op_def_util.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/hash.h" #include "tensorflow/core/platform/strcat.h" +#include "tensorflow/core/platform/stringpiece.h" +#include "tensorflow/core/platform/types.h" #include "tensorflow/core/public/version.h" namespace tensorflow { diff --git a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc index b76be0fd608715..8b2570fdc953a7 100644 --- a/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc +++ b/tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen_main.cc @@ -19,15 +19,17 @@ limitations under the License. 
#include #include -#include "absl/status/status.h" #include "tensorflow/cc/framework/cc_op_gen_util.h" #include "tensorflow/cc/framework/fuzzing/cc_op_fuzz_gen.h" +#include "tensorflow/core/framework/api_def.pb.h" #include "tensorflow/core/framework/op_def.pb.h" #include "tensorflow/core/framework/op_gen_lib.h" #include "tensorflow/core/lib/core/stringpiece.h" -#include "tensorflow/core/lib/io/path.h" #include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/file_system.h" #include "tensorflow/core/platform/init_main.h" +#include "tensorflow/core/platform/statusor.h" +#include "tensorflow/core/platform/str_util.h" #include "tensorflow/core/platform/types.h" #include "tsl/platform/status.h" diff --git a/tensorflow/cc/saved_model/BUILD b/tensorflow/cc/saved_model/BUILD index 935b37b37aa5c0..4d8d5dfa11da87 100644 --- a/tensorflow/cc/saved_model/BUILD +++ b/tensorflow/cc/saved_model/BUILD @@ -109,7 +109,7 @@ cc_library( "//tensorflow/core:direct_session", "//tensorflow/core:all_kernels", ] + if_google( - ["@local_tsl//tsl/platform/default/build_config:tensorflow_platform_specific"], + ["//tensorflow/core/platform/default/build_config:tensorflow_platform_specific"], [], )) + if_not_mobile([ "//tensorflow/core:core_cpu", diff --git a/tensorflow/cc/saved_model/loader.cc b/tensorflow/cc/saved_model/loader.cc index a245bf59a1f187..ae63fdab2fa32c 100644 --- a/tensorflow/cc/saved_model/loader.cc +++ b/tensorflow/cc/saved_model/loader.cc @@ -104,7 +104,7 @@ static Status ValidateNode(const NodeDef& node) { "Saved model contains node \"", node.name(), "\" which is a constant tensor but no value has been provided")); } - return OkStatus(); + return absl::OkStatus(); } static Status ValidateFunctionNotRecursive(const FunctionDef& function) { @@ -117,7 +117,7 @@ static Status ValidateFunctionNotRecursive(const FunctionDef& function) { } } - return OkStatus(); + return absl::OkStatus(); } static Status ValidateSavedTensors(const GraphDef& graph_def) { @@ -137,7 
+137,7 @@ static Status ValidateSavedTensors(const GraphDef& graph_def) { } } - return OkStatus(); + return absl::OkStatus(); } Tensor CreateStringTensor(const string& value) { @@ -223,7 +223,7 @@ Status RunInitOp(const RunOptions& run_options, const string& export_dir, return RunOnce(run_options, inputs, {}, {init_op_name}, nullptr /* outputs */, &run_metadata, session); } - return OkStatus(); + return absl::OkStatus(); } Status RunRestore(const RunOptions& run_options, const string& export_dir, @@ -247,7 +247,7 @@ Status RunRestore(const RunOptions& run_options, const string& export_dir, LOG(INFO) << "The specified SavedModel has no variables; no checkpoints " "were restored. File does not exist: " << variables_index_path; - return OkStatus(); + return absl::OkStatus(); } const string variables_path = io::JoinPath(variables_directory, kSavedModelVariablesFilename); @@ -293,7 +293,7 @@ Status LoadSavedModelInternal(const SessionOptions& session_options, session_options, bundle->meta_graph_def, &bundle->session)); TF_RETURN_IF_ERROR(RestoreSession(run_options, bundle->meta_graph_def, export_dir, &bundle->session)); - return OkStatus(); + return absl::OkStatus(); } Status LoadSavedModel(const SessionOptions& session_options, @@ -469,7 +469,7 @@ Status RestoreSession(const RunOptions& run_options, // Record wall time spent in init op. 
load_latency_by_stage->GetCell(export_dir, "init_graph") ->Add(GetLatencyMicroseconds(graph_init_start_microseconds)); - return OkStatus(); + return absl::OkStatus(); } Status LoadSavedModel(const SessionOptions& session_options, @@ -494,7 +494,7 @@ Status LoadSavedModel(const SessionOptions& session_options, *bundle = SavedModelBundleLite( std::make_unique(std::move(legacy_bundle.session)), std::move(*legacy_bundle.meta_graph_def.mutable_signature_def())); - return OkStatus(); + return absl::OkStatus(); } bool MaybeSavedModelDirectory(const string& export_dir) { diff --git a/tensorflow/cc/saved_model/loader_util.cc b/tensorflow/cc/saved_model/loader_util.cc index e17f2ed4abb690..3a984bf31b3cd9 100644 --- a/tensorflow/cc/saved_model/loader_util.cc +++ b/tensorflow/cc/saved_model/loader_util.cc @@ -42,7 +42,7 @@ Status GetInitOp(const string& export_dir, const MetaGraphDef& meta_graph_def, kSavedModelInitOpSignatureKey); } *init_op_name = sig_def_outputs_it->second.name(); - return OkStatus(); + return absl::OkStatus(); } const auto& collection_def_map = meta_graph_def.collection_def(); @@ -62,7 +62,7 @@ Status GetInitOp(const string& export_dir, const MetaGraphDef& meta_graph_def, } *init_op_name = init_op_it->second.node_list().value(0); } - return OkStatus(); + return absl::OkStatus(); } Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, @@ -73,13 +73,13 @@ Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, for (const auto& asset : meta_graph_def.asset_file_def()) { asset_file_defs->push_back(asset); } - return OkStatus(); + return absl::OkStatus(); } // Fall back to read from collection to be backward compatible with v1. 
const auto& collection_def_map = meta_graph_def.collection_def(); const auto assets_it = collection_def_map.find(kSavedModelAssetsKey); if (assets_it == collection_def_map.end()) { - return OkStatus(); + return absl::OkStatus(); } const auto& any_assets = assets_it->second.any_list().value(); for (const auto& any_asset : any_assets) { @@ -88,7 +88,7 @@ Status GetAssetFileDefs(const MetaGraphDef& meta_graph_def, ParseAny(any_asset, &asset_file_def, "tensorflow.AssetFileDef")); asset_file_defs->push_back(asset_file_def); } - return OkStatus(); + return absl::OkStatus(); } } // namespace internal diff --git a/tensorflow/cc/saved_model/reader.cc b/tensorflow/cc/saved_model/reader.cc index 5563439f290391..b90f84438b3abf 100644 --- a/tensorflow/cc/saved_model/reader.cc +++ b/tensorflow/cc/saved_model/reader.cc @@ -63,7 +63,7 @@ Status FindMetaGraphDef(const std::unordered_set& tags, if (!port::kLittleEndian) { TF_RETURN_IF_ERROR(ByteSwapTensorContentInMetaGraphDef(meta_graph_def)); } - return OkStatus(); + return absl::OkStatus(); } } return Status( @@ -137,7 +137,7 @@ Status ReadMetaGraphDefFromSavedModel(const string& export_dir, TF_RETURN_IF_ERROR(ReadSavedModel(export_dir, &saved_model_proto)); TF_RETURN_IF_ERROR( FindMetaGraphDef(tags, &saved_model_proto, meta_graph_def)); - return OkStatus(); + return absl::OkStatus(); } Status ReadSavedModelDebugInfoIfPresent( @@ -156,7 +156,7 @@ Status ReadSavedModelDebugInfoIfPresent( ReadBinaryProto(Env::Default(), debug_info_pb_path, &debug_info)); *debug_info_proto = std::make_unique(std::move(debug_info)); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/cc/saved_model/util.cc b/tensorflow/cc/saved_model/util.cc index 3e0b1eb27026bb..b474f1ef3ed0f3 100644 --- a/tensorflow/cc/saved_model/util.cc +++ b/tensorflow/cc/saved_model/util.cc @@ -86,7 +86,7 @@ Status GetInputValues( absl::StrJoin(seen_request_inputs, ","), ", request input: ", 
absl::StrJoin(GetMapKeys(request_inputs), ","))); } - return OkStatus(); + return absl::OkStatus(); } } // namespace saved_model diff --git a/tensorflow/compiler/aot/BUILD b/tensorflow/compiler/aot/BUILD index 8eb99e2b46fa09..ee7f9e4805a1aa 100644 --- a/tensorflow/compiler/aot/BUILD +++ b/tensorflow/compiler/aot/BUILD @@ -1,7 +1,7 @@ -load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") -load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow:tensorflow.bzl", "if_google", "if_oss", "tf_cc_binary", "tf_cc_test") +load("//tensorflow/compiler/aot:tfcompile.bzl", "tf_library") load("//tensorflow/core/platform:build_config.bzl", "if_llvm_aarch64_available", "if_llvm_system_z_available") +load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -18,7 +18,7 @@ cc_library( deps = if_oss([ "//tensorflow/core:test_main", ]) + if_google([ - "@local_tsl//tsl/platform/default/build_config:test_main", + "//tensorflow/core/platform/default/build_config:test_main", ]), ) diff --git a/tensorflow/compiler/aot/codegen_test.cc b/tensorflow/compiler/aot/codegen_test.cc index fcd230f4ab296a..f7273c091a4d37 100644 --- a/tensorflow/compiler/aot/codegen_test.cc +++ b/tensorflow/compiler/aot/codegen_test.cc @@ -153,7 +153,7 @@ static void CompareWithGoldenFile( // To update the golden file, flip update_golden to true and run the // following: - // bazel test --test_strategy=local \ + // blaz test --test_strategy=local \ // "third_party/tensorflow/compiler/aot:codegen_test" const bool update_golden = false; string golden_file_name = @@ -230,7 +230,7 @@ TEST(CodegenTest, Golden) { /*result_param_number=*/1), BufferInfo::MakeResultParameter(/*size=*/5 * 4, /*result_param_number=*/2)}, - 0, {})); + 0, nullptr, {})); compile_result.program_shape = xla::ShapeUtil::MakeProgramShape( { diff --git a/tensorflow/compiler/aot/codegen_test_h.golden 
b/tensorflow/compiler/aot/codegen_test_h.golden index 1131d95fdb18ad..71b234a8385806 100644 --- a/tensorflow/compiler/aot/codegen_test_h.golden +++ b/tensorflow/compiler/aot/codegen_test_h.golden @@ -462,7 +462,7 @@ class MyClass final : public tensorflow::XlaCompiledCpuFunction { static const ::xla::ProgramShapeProto* StaticProgramShape() { static const ::xla::ProgramShapeProto* kShape = []() { ::xla::ProgramShapeProto* proto = new ::xla::ProgramShapeProto; - proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 133); + proto->ParseFromArray(&__tfcompile_foo_bar_MyClass_ProgramShapeProto_protobuf_array_contents[0], 157); return proto; }(); return kShape; diff --git a/tensorflow/compiler/aot/codegen_test_o.golden b/tensorflow/compiler/aot/codegen_test_o.golden index 55178e3ee7130a..28910275330f8c 100644 Binary files a/tensorflow/compiler/aot/codegen_test_o.golden and b/tensorflow/compiler/aot/codegen_test_o.golden differ diff --git a/tensorflow/compiler/jit/BUILD b/tensorflow/compiler/jit/BUILD index a180ee12e22e9b..9e3fd27b8f6e86 100644 --- a/tensorflow/compiler/jit/BUILD +++ b/tensorflow/compiler/jit/BUILD @@ -1814,9 +1814,6 @@ tf_cc_test( tf_cuda_cc_test( name = "device_compiler_test", srcs = ["device_compiler_test.cc"], - env = { - "XLA_FLAGS": "--xla_gpu_enable_xla_runtime_executable", - }, tags = [ "config-cuda-only", "no_oss", # This test only runs with GPU. diff --git a/tensorflow/compiler/jit/device_compiler.h b/tensorflow/compiler/jit/device_compiler.h index 05c3f35a0042b8..fc562d8277a77e 100644 --- a/tensorflow/compiler/jit/device_compiler.h +++ b/tensorflow/compiler/jit/device_compiler.h @@ -38,6 +38,7 @@ limitations under the License. 
#include "tensorflow/compiler/jit/xla_compile_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "xla/client/local_client.h" +#include "tensorflow/core/framework/metrics.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/lib/core/threadpool.h" #include "tensorflow/core/platform/mutex.h" @@ -317,6 +318,7 @@ DeviceCompiler::CompileStrict( cache_value.compilation_status = loaded_executable->status(); if (loaded_executable->ok()) { out_executable = *std::move(*loaded_executable); + metrics::UpdatePersistentCacheLoadCount(); } } else { auto built_executable = diff --git a/tensorflow/compiler/jit/flags.cc b/tensorflow/compiler/jit/flags.cc index 82ed25767b90de..f85fd5fde4c1fa 100644 --- a/tensorflow/compiler/jit/flags.cc +++ b/tensorflow/compiler/jit/flags.cc @@ -282,6 +282,7 @@ void AllocateAndParseFlags() { bool enable_mlir_bridge_is_explicit = false; bool enable_mlir_merge_control_flow_pass = true; bool enable_mlir_convert_control_to_data_outputs_pass = false; + bool enable_mlir_composite_tpuexecute_side_effects = false; bool enable_mlir_strict_clusters = false; bool enable_mlir_multiple_local_cpu_devices = false; // Dump graphs in TFG dialect. 
@@ -376,6 +377,10 @@ void AllocateAndParseFlags() { &enable_mlir_convert_control_to_data_outputs_pass, "Enables `tf-executor-convert-control-to-data-outputs` pass for " "MLIR-Based TensorFlow Compiler Bridge."), + Flag("tf_mlir_composite_tpuexecute_side_effects", + &enable_mlir_composite_tpuexecute_side_effects, + "Enables certain TPUExecute ops to run in parallel if they only " + "operate on resources that live on composite devices."), Flag("tf_mlir_enable_strict_clusters", &enable_mlir_strict_clusters, "Do not allow clusters that have cyclic control dependencies."), Flag("tf_mlir_enable_multiple_local_cpu_devices", @@ -414,6 +419,8 @@ void AllocateAndParseFlags() { enable_mlir_merge_control_flow_pass; mlir_flags->tf_mlir_enable_convert_control_to_data_outputs_pass = enable_mlir_convert_control_to_data_outputs_pass; + mlir_flags->tf_mlir_enable_composite_tpuexecute_side_effects = + enable_mlir_composite_tpuexecute_side_effects; mlir_flags->tf_mlir_enable_strict_clusters = enable_mlir_strict_clusters; mlir_flags->tf_mlir_enable_generic_outside_compilation = enable_mlir_generic_outside_compilation; diff --git a/tensorflow/compiler/jit/flags.h b/tensorflow/compiler/jit/flags.h index 45a4c83a614afd..d2c078a617b258 100644 --- a/tensorflow/compiler/jit/flags.h +++ b/tensorflow/compiler/jit/flags.h @@ -288,6 +288,7 @@ struct MlirCommonFlags { bool tf_mlir_enable_merge_control_flow_pass; bool tf_mlir_enable_convert_control_to_data_outputs_pass; + bool tf_mlir_enable_composite_tpuexecute_side_effects; bool tf_mlir_enable_strict_clusters; bool tf_mlir_enable_generic_outside_compilation; bool tf_mlir_enable_tpu_variable_runtime_reformatting_pass; diff --git a/tensorflow/compiler/jit/kernels/xla_ops.cc b/tensorflow/compiler/jit/kernels/xla_ops.cc index 435b63d8f5dbe9..6ba7afe9884e91 100644 --- a/tensorflow/compiler/jit/kernels/xla_ops.cc +++ b/tensorflow/compiler/jit/kernels/xla_ops.cc @@ -212,7 +212,7 @@ Status GetTaskName(const std::string_view device_name, std::string* 
task_name) { device_name); } - return OkStatus(); + return absl::OkStatus(); } // Provide SendDeviceMemoryFunction for XLA host callbacks. This callback @@ -400,7 +400,7 @@ Status CompileToLocalExecutable( rm->default_container(), "device_compilation_profiler", &profiler, [](DeviceCompilationProfiler** profiler) { *profiler = new DeviceCompilationProfiler(); - return OkStatus(); + return absl::OkStatus(); })); // Hold the reference to the XLA device compiler and profiler during // evaluation. (We could probably free them sooner because the ResourceMgr @@ -899,7 +899,7 @@ void XlaRunOp::Compute(OpKernelContext* ctx) { closure.client(), closure.executable(), ctx)); } - OP_REQUIRES_OK(ctx, OkStatus()); + OP_REQUIRES_OK(ctx, absl::OkStatus()); return; } diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass.cc b/tensorflow/compiler/jit/mark_for_compilation_pass.cc index 526059c22fde8b..bf5a0ce68dd4a4 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass.cc @@ -637,7 +637,7 @@ Status IgnoreResourceOpForSafetyAnalysis( if (n.assigned_device_name().empty()) { *ignore = false; - return OkStatus(); + return absl::OkStatus(); } TF_ASSIGN_OR_RETURN( @@ -649,7 +649,7 @@ Status IgnoreResourceOpForSafetyAnalysis( } else { *ignore = registration->cluster_resource_variable_ops_unsafely; } - return OkStatus(); + return absl::OkStatus(); } StatusOr MarkForCompilationPassImpl::Initialize() { @@ -892,7 +892,7 @@ Status MarkForCompilationPassImpl::RunEdgeContractionLoop() { })); TF_RET_CHECK(!changed); - return OkStatus(); + return absl::OkStatus(); } Status MarkForCompilationPassImpl::DeclusterNodes() { @@ -922,7 +922,7 @@ Status MarkForCompilationPassImpl::DeclusterNodes() { } } - return OkStatus(); + return absl::OkStatus(); } // Tracks monotonic sequence numbers for graphs. 
@@ -1010,7 +1010,7 @@ Status MarkForCompilationPassImpl::CreateClusters() { } } - return OkStatus(); + return absl::OkStatus(); } Status MarkForCompilationPassImpl::DumpDebugInfo() { @@ -1022,7 +1022,7 @@ Status MarkForCompilationPassImpl::DumpDebugInfo() { VLogClusteringSummary(); - return OkStatus(); + return absl::OkStatus(); } StatusOr @@ -1181,7 +1181,7 @@ Status MarkForCompilationPassImpl::BuildInitialClusterSet() { cluster_for_node_[node->id()].Get() = new_cluster; } - return OkStatus(); + return absl::OkStatus(); } StatusOr IsIdentityDrivingConstsInLoop(Node* node) { @@ -1475,7 +1475,7 @@ Status MarkForCompilationPassImpl::FindCompilationCandidates() { VLOG(2) << "compilation_candidates_.size() = " << compilation_candidates_.size(); - return OkStatus(); + return absl::OkStatus(); } bool MarkForCompilationPassImpl::CompilationDisallowedByXlaCompileAttr( @@ -1596,7 +1596,7 @@ Status MarkForCompilationPassImpl::Run() { if (!initialized) { // Initialization exited early which means this instance of // MarkForCompilationPassImpl is not set up to run the subsequent phases. - return OkStatus(); + return absl::OkStatus(); } TF_RETURN_IF_ERROR(RunEdgeContractionLoop()); @@ -1604,7 +1604,7 @@ Status MarkForCompilationPassImpl::Run() { TF_RETURN_IF_ERROR(CreateClusters()); TF_RETURN_IF_ERROR(DumpDebugInfo()); - return OkStatus(); + return absl::OkStatus(); } void MarkForCompilationPassImpl::DumpPostClusteringGraphs() { @@ -1864,14 +1864,14 @@ Status MarkForCompilation( for (Node* n : graph->nodes()) { // See explanation on `kXlaAlreadyClustered`. if (n->attrs().Find(kXlaAlreadyClustered)) { - return OkStatus(); + return absl::OkStatus(); } // Skip the pass if we found TPUExecute or TPUExecuteAndUpdateVariables ops // in the graph, which indicates the graph is produced by TPU TF-XLA bridge // and doesn't require auto clustering. 
if (n->type_string() == "TPUExecute" || n->type_string() == "TPUExecuteAndUpdateVariables") { - return OkStatus(); + return absl::OkStatus(); } } @@ -2277,6 +2277,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "VariableShape", "Where", "While", + "XlaAllReduce", "XlaBroadcastHelper", "XlaCallModule", "XlaConcatND", @@ -2298,6 +2299,7 @@ absl::flat_hash_set GetKnownXLAAllowlistOp() { "XlaRecv", "XlaReduce", "XlaReducePrecision", + "XlaReduceScatter", "XlaReduceWindow", "XlaRemoveDynamicDimensionSize", "XlaReplicaId", diff --git a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc index b9527ae9ec56b0..aabedf61202d3f 100644 --- a/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc +++ b/tensorflow/compiler/jit/mark_for_compilation_pass_test.cc @@ -390,7 +390,7 @@ static Status GradForUnaryCwise(FunctionDef* g, {}, // Nodes nodes); - return OkStatus(); + return absl::OkStatus(); } // A gradient containing only supported operators @@ -1816,7 +1816,7 @@ TEST(XlaCompilationTest, DeterministicClusterNames) { " rhs: ", rhs_cluster_name); } - return OkStatus(); + return absl::OkStatus(); }; testing::ResetClusterSequenceNumber(); diff --git a/tensorflow/compiler/jit/ops/xla_ops.cc b/tensorflow/compiler/jit/ops/xla_ops.cc index cf0e03bc9f64e5..7c370e46dec63f 100644 --- a/tensorflow/compiler/jit/ops/xla_ops.cc +++ b/tensorflow/compiler/jit/ops/xla_ops.cc @@ -55,7 +55,7 @@ REGISTER_OP("XlaClusterOutput") for (int i = 0; i < c->num_outputs(); ++i) { c->set_output(i, c->input(0)); } - return OkStatus(); + return absl::OkStatus(); }) .Doc( "Operator that connects the output of an XLA computation to other " @@ -112,7 +112,7 @@ REGISTER_OP("_XlaMerge") .Attr("T: type") .SetShapeFn([](InferenceContext* c) { c->set_output(0, c->input(0)); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"(XLA Merge Op. For use by the XLA JIT only. 
diff --git a/tensorflow/compiler/jit/partially_decluster_pass.cc b/tensorflow/compiler/jit/partially_decluster_pass.cc index 715cc1b31738b5..eb66a8d905cc8c 100644 --- a/tensorflow/compiler/jit/partially_decluster_pass.cc +++ b/tensorflow/compiler/jit/partially_decluster_pass.cc @@ -113,7 +113,7 @@ Status FindNodesToDecluster(const Graph& graph, } } } - return OkStatus(); + return absl::OkStatus(); } Status PartiallyDeclusterNode(Graph* graph, Node* n) { @@ -156,7 +156,7 @@ Status PartiallyDeclusterNode(Graph* graph, Node* n) { graph->RemoveNode(n); } - return OkStatus(); + return absl::OkStatus(); } // Clones nodes to outside their cluster to avoid device-to-host copies. For @@ -221,7 +221,7 @@ Status PartiallyDeclusterGraph(Graph* graph) { FindNodesToDecluster(*graph, &nodes_to_partially_decluster, post_order)); CHECK(nodes_to_partially_decluster.empty()); - return OkStatus(); + return absl::OkStatus(); } } // namespace reduce_device_to_host_copies @@ -251,12 +251,12 @@ Status MustCompileNode(const Node* n, bool* must_compile) { if (IsMustCompileDevice(device_type)) { *must_compile = true; - return OkStatus(); + return absl::OkStatus(); } // We must compile `n` if it does not have a TensorFlow kernel. 
*must_compile = !FindKernelDef(device_type, n->def(), nullptr, nullptr).ok(); - return OkStatus(); + return absl::OkStatus(); } // Declusters nodes to reduce the number of times we think we need to recompile @@ -363,7 +363,7 @@ Status PartiallyDeclusterGraph(Graph* graph, } } - return OkStatus(); + return absl::OkStatus(); } } // namespace reduce_recompilation @@ -397,7 +397,7 @@ Status PartiallyDeclusterGraph(Graph* graph) { << " because it is a root shape consumer"; RemoveFromXlaCluster(n); } - return OkStatus(); + return absl::OkStatus(); } } // namespace decluster_root_shape_consumers } // namespace @@ -430,6 +430,6 @@ Status PartiallyDeclusterPass::Run( TF_RETURN_IF_ERROR( decluster_root_shape_consumers::PartiallyDeclusterGraph(graph)); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/pjrt_device_context.cc b/tensorflow/compiler/jit/pjrt_device_context.cc index cd8d231af19f5b..a403557f9a9087 100644 --- a/tensorflow/compiler/jit/pjrt_device_context.cc +++ b/tensorflow/compiler/jit/pjrt_device_context.cc @@ -96,7 +96,7 @@ void PjRtDeviceContext::CopyDeviceTensorToCPU(const Tensor* device_tensor, profiler::TraceMe traceme("PjRtDeviceContext::CopyDeviceTensorToCPU"); if (device_tensor->NumElements() == 0) { VLOG(2) << "CopyDeviceTensorToCPU empty tensor"; - done(OkStatus()); + done(absl::OkStatus()); return; } auto literal = std::make_unique(); @@ -149,7 +149,7 @@ void PjRtDeviceContext::CopyCPUTensorToDevice(const Tensor* cpu_tensor, profiler::TraceMe traceme("PjRtDeviceContext::CopyCPUTensorToDevice"); if (cpu_tensor->NumElements() == 0) { VLOG(2) << "CopyCPUTensorToDevice empty tensor"; - done(OkStatus()); + done(absl::OkStatus()); return; } @@ -243,7 +243,7 @@ void PjRtDeviceToDeviceCopy(DeviceContext* send_dev_context, profiler::TraceMe traceme("PjRtDevice_DeviceToDeviceCopy"); if (input->NumElements() == 0) { VLOG(2) << "PjRtDevice_DeviceToDeviceCopy empty tensor"; - done(OkStatus()); + 
done(absl::OkStatus()); return; } diff --git a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc index 9cf49b0666ddd5..ffbcef3371ae81 100644 --- a/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc +++ b/tensorflow/compiler/jit/rearrange_function_argument_pass_test.cc @@ -118,7 +118,7 @@ TEST(RearrangeFunctionArgumentForFunctionTest, Basic) { &fld, &new_fbody)); *fbody = new_fbody.get(); fbodies.push_back(std::move(new_fbody)); - return OkStatus(); + return absl::OkStatus(); }, g.get(), &fld)); @@ -229,7 +229,7 @@ TEST(RearrangeFunctionArgumentForFunctionTest, &fld, &new_fbody)); *fbody = new_fbody.get(); fbodies.push_back(std::move(new_fbody)); - return OkStatus(); + return absl::OkStatus(); }, g.get(), &fld); EXPECT_EQ(status.code(), error::UNIMPLEMENTED); diff --git a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc index 650369863c4d05..92f79dde874217 100644 --- a/tensorflow/compiler/jit/resource_operation_safety_analysis.cc +++ b/tensorflow/compiler/jit/resource_operation_safety_analysis.cc @@ -106,13 +106,13 @@ Status XlaResourceOpKindForNode( } if (should_ignore) { *out_resource_op_kind = std::nullopt; - return OkStatus(); + return absl::OkStatus(); } const XlaResourceOpInfo* op_info = GetResourceOpInfoForOp(n.type_string()); if (op_info) { *out_resource_op_kind = op_info->kind(); - return OkStatus(); + return absl::OkStatus(); } // We conservatively assume that functions will both read and write resource @@ -124,7 +124,7 @@ Status XlaResourceOpKindForNode( *out_resource_op_kind = std::nullopt; } - return OkStatus(); + return absl::OkStatus(); } // Returns true if a control or data dependence from a TensorFlow operation of @@ -314,6 +314,6 @@ Status ComputeIncompatibleResourceOperationPairs( std::sort(result->begin(), result->end()); CHECK(std::unique(result->begin(), result->end()) == 
result->end()); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/shape_inference.cc b/tensorflow/compiler/jit/shape_inference.cc index b9cbd1e3105b13..9a6bb729149dbd 100644 --- a/tensorflow/compiler/jit/shape_inference.cc +++ b/tensorflow/compiler/jit/shape_inference.cc @@ -33,7 +33,7 @@ Status ShapeHandleToTensorShape(shape_inference::InferenceContext* context, const shape_inference::ShapeHandle& handle, PartialTensorShape* shape) { // The default is already unknown - if (!context->RankKnown(handle)) return OkStatus(); + if (!context->RankKnown(handle)) return absl::OkStatus(); std::vector dims(context->Rank(handle)); for (int32_t i = 0, end = dims.size(); i < end; ++i) { @@ -199,7 +199,7 @@ Status PropagateShapes(Graph* graph, } } } - return OkStatus(); + return absl::OkStatus(); } // Store the shapes of the output tensors in a map @@ -235,7 +235,7 @@ Status StoreOutputShapes(const Graph& graph, const ShapeRefiner& shape_refiner, << output.handle_shape.DebugString(); } } - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/compiler/jit/shape_inference_helpers.cc b/tensorflow/compiler/jit/shape_inference_helpers.cc index f3dd0c7ec78453..9290861d48f0bc 100644 --- a/tensorflow/compiler/jit/shape_inference_helpers.cc +++ b/tensorflow/compiler/jit/shape_inference_helpers.cc @@ -41,7 +41,7 @@ Status BackEdgeHelper::Remove(Graph* graph) { for (const BackEdge& be : back_edges_) { graph_->RemoveEdge(be.edge); } - return OkStatus(); + return absl::OkStatus(); } const std::vector& BackEdgeHelper::RemovedEdges() @@ -60,7 +60,7 @@ Status BackEdgeHelper::Replace() { for (const BackEdge& be : back_edges_) { graph_->AddEdge(be.src, be.src_output, be.dst, be.dst_input); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/test_util.cc b/tensorflow/compiler/jit/test_util.cc index 41afd63cca3b1e..f073902bc03d4a 100644 --- 
a/tensorflow/compiler/jit/test_util.cc +++ b/tensorflow/compiler/jit/test_util.cc @@ -58,7 +58,7 @@ Status ShapeAnnotationsMatch( return errors::InvalidArgument("Missing shapes for nodes: ", absl::StrJoin(missing, ",")); } - return OkStatus(); + return absl::OkStatus(); } void DeviceSetup::AddDevicesAndSetUp( diff --git a/tensorflow/compiler/jit/tests/BUILD b/tensorflow/compiler/jit/tests/BUILD index 2d382cbda5f7f0..e9880013bf2611 100644 --- a/tensorflow/compiler/jit/tests/BUILD +++ b/tensorflow/compiler/jit/tests/BUILD @@ -88,9 +88,6 @@ tf_cc_test( srcs = [ "device_compiler_serialize_test.cc", ], - env = { - "XLA_FLAGS": "--xla_gpu_enable_xla_runtime_executable", - }, tags = [ "config-cuda-only", "no_oss", # This test only runs with GPU. @@ -110,9 +107,6 @@ tf_cc_test( srcs = [ "device_compiler_serialize_options_test.cc", ], - env = { - "XLA_FLAGS": "--xla_gpu_enable_xla_runtime_executable", - }, tags = [ "config-cuda-only", "no_oss", # This test only runs with GPU. diff --git a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc index e93af0df217e84..8833ce69d4bf6b 100644 --- a/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc +++ b/tensorflow/compiler/jit/tests/auto_clustering_test_helper.cc @@ -95,7 +95,7 @@ Status AssertGraphDefIsUnclustered(const GraphDef& graphdef) { } } - return OkStatus(); + return absl::OkStatus(); } Status ReadTextProtoFromString(Env* env, const string& data, @@ -103,7 +103,7 @@ Status ReadTextProtoFromString(Env* env, const string& data, if (!::tensorflow::protobuf::TextFormat::ParseFromString(data, proto)) { return errors::DataLoss("Can't parse input data as text proto"); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -121,7 +121,7 @@ Status AutoClusteringTest::RunAutoClusteringTestImpl( LOG(INFO) << "Not running " << ::testing::UnitTest::GetInstance()->current_test_info()->name() << " since test was not built with --config=cuda"; 
- return OkStatus(); + return absl::OkStatus(); } TF_RETURN_IF_ERROR(AssertGraphDefIsUnclustered(graphdef)); @@ -158,7 +158,7 @@ Status AutoClusteringTest::RunAutoClusteringTestImpl( EXPECT_EQ(golden_file_contents, clustering_summary); - return OkStatus(); + return absl::OkStatus(); } Status AutoClusteringTest::RunAutoClusteringTestWithPbtxt( @@ -221,7 +221,7 @@ Status BenchmarkMarkForCompilation(absl::string_view graph_def_path, std::move(graph_def_copy), &result)); } - return OkStatus(); + return absl::OkStatus(); } #endif // PLATFORM_GOOGLE diff --git a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc index c1ae143ee94508..4eb93e85819651 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc +++ b/tensorflow/compiler/jit/tests/device_compiler_test_helper.cc @@ -131,7 +131,7 @@ Status DeviceCompilerSerializeTest::ExecuteWithBatch(const GraphDef& graph, EXPECT_NEAR(golden_output_tensors[0].flat()(i), output_tensors[0].flat()(i), 1e-3); } - return OkStatus(); + return absl::OkStatus(); } Status DeviceCompilerSerializeTest::AlterPersistentCacheEntryHloModuleNames( @@ -160,7 +160,7 @@ Status DeviceCompilerSerializeTest::AlterPersistentCacheEntryHloModuleNames( return errors::NotFound( "Did not find any persistent XLA compilation cache entries to alter."); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/jit/tests/device_compiler_test_helper.h b/tensorflow/compiler/jit/tests/device_compiler_test_helper.h index e8ae70928d17d4..9cf36d0cbc6cb1 100644 --- a/tensorflow/compiler/jit/tests/device_compiler_test_helper.h +++ b/tensorflow/compiler/jit/tests/device_compiler_test_helper.h @@ -33,17 +33,17 @@ class JitCompilationListener : public XlaActivityListener { public: Status Listen( const XlaAutoClusteringActivity& auto_clustering_activity) override { - return OkStatus(); + return absl::OkStatus(); } Status Listen( 
const XlaJitCompilationActivity& jit_compilation_activity) override { activity_history_.push_back(jit_compilation_activity); - return OkStatus(); + return absl::OkStatus(); } Status Listen(const XlaOptimizationRemark& optimization_remark) override { - return OkStatus(); + return absl::OkStatus(); } ~JitCompilationListener() override = default; @@ -55,7 +55,7 @@ class JitCompilationListener : public XlaActivityListener { return absl::FailedPreconditionError("Unexpected listener history."); } } - return OkStatus(); + return absl::OkStatus(); } std::vector GetListenerHistory() { diff --git a/tensorflow/compiler/jit/xla_device.cc b/tensorflow/compiler/jit/xla_device.cc index 6f71814bbe7b6a..c47096796d7f86 100644 --- a/tensorflow/compiler/jit/xla_device.cc +++ b/tensorflow/compiler/jit/xla_device.cc @@ -501,38 +501,6 @@ Status XlaDevice::Sync() { return OkStatus(); } -// TODO(b/112409994): This is no longer necessary. Consolidate it with the -// synchronous version. -void XlaDevice::Sync(const DoneCallback& done) { - VLOG(1) << "XlaDevice::Sync (asynchronous)"; - std::shared_ptr stream; - { - mutex_lock lock(mu_); - stream = stream_; - } - if (!stream) { - done(OkStatus()); - return; - } - - // The call to ThenEnqueueOnBackgroundThread below enqueues a host callback at - // the end of the stream, after everything that has already been enqueued - // there at this moment. When the host callback is called, everything before - // it must have already finished, and the host callback will then place the - // task below onto a background thread. (See the implementation of - // ThenEnqueueOnBackgroundThread for details.) Therefore, when the done - // callback is finally called from that background thread, we know for sure - // that everything enqueued onto the stream (i.e., the device) at this very - // moment--when ThenEnqueueOnBackgroundThread is called--will have finished. - // This achieves a device-wide sync. 
- stream->ThenEnqueueOnBackgroundThread([stream, done](se::StreamExecutor*) { - profiler::TraceMe activity("XlaDevice::Sync::Callback", - profiler::TraceMeLevel::kInfo); - done(stream->ok() ? OkStatus() - : errors::Internal("XlaDevice::Sync() failed.")); - }); -} - Status XlaDevice::MakeTensorFromProto(DeviceContext* device_context, const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, diff --git a/tensorflow/compiler/jit/xla_device.h b/tensorflow/compiler/jit/xla_device.h index aeff4501af480d..ecfdd073253a80 100644 --- a/tensorflow/compiler/jit/xla_device.h +++ b/tensorflow/compiler/jit/xla_device.h @@ -158,7 +158,6 @@ class XlaDevice : public LocalDevice { void ComputeAsync(AsyncOpKernel* op_kernel, OpKernelContext* context, AsyncOpKernel::DoneCallback done) override; Status Sync() override; - void Sync(const DoneCallback& done) override; Status TryGetDeviceContext(DeviceContext** out_context) override TF_LOCKS_EXCLUDED(mu_); diff --git a/tensorflow/compiler/jit/xla_launch_util.cc b/tensorflow/compiler/jit/xla_launch_util.cc index 288344eb341d23..dff81fdd367fcc 100644 --- a/tensorflow/compiler/jit/xla_launch_util.cc +++ b/tensorflow/compiler/jit/xla_launch_util.cc @@ -673,7 +673,6 @@ Status PreparePjRtExecutableArguments( } } else { if (av_tensor->GetBuffer() == nullptr) { - // TODO(b/260799971): verify size 0 argument is supported. 
CHECK_EQ(tensor->NumElements(), 0); // Crash OK continue; } diff --git a/tensorflow/compiler/mlir/BUILD b/tensorflow/compiler/mlir/BUILD index 742f97d902522a..b30f08a1bfe1b4 100644 --- a/tensorflow/compiler/mlir/BUILD +++ b/tensorflow/compiler/mlir/BUILD @@ -74,7 +74,6 @@ cc_library( "@local_xla//xla/mlir/framework/transforms:passes", "@local_xla//xla/mlir_hlo:all_passes", "@local_xla//xla/service/cpu:hlo_xla_runtime_pipeline", - "@local_xla//xla/translate/mhlo_to_lhlo_with_xla", ], ) @@ -204,7 +203,6 @@ cc_library( "@local_xla//xla/mlir/framework/ir:xla_framework", "@local_xla//xla/mlir_hlo:hlo_dialect_registration", "@local_xla//xla/service/cpu:hlo_xla_runtime_pipeline", - "@local_xla//xla/translate/mhlo_to_lhlo_with_xla", "@stablehlo//:register", ], ) @@ -239,7 +237,6 @@ tf_cc_binary( "@llvm-project//mlir:TranslateLib", "@local_xla//xla/translate/hlo_to_mhlo:translate_registration", "@local_xla//xla/translate/mhlo_to_hlo:translate_registration", - "@local_xla//xla/translate/mhlo_to_lhlo_with_xla:translate_registration", ], ) diff --git a/tensorflow/compiler/mlir/lite/BUILD b/tensorflow/compiler/mlir/lite/BUILD index 3ebbd3b1b81942..38610dcef42b52 100644 --- a/tensorflow/compiler/mlir/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/BUILD @@ -5,7 +5,7 @@ load("//tensorflow:tensorflow.default.bzl", "filegroup", "get_compatible_with_po load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # copybara:uncomment default_applicable_licenses = ["//tensorflow:LICENSE"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) @@ -452,14 +452,15 @@ cc_library( "utils/constant_utils.h", ], deps = [ - "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mangling_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_attributes", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:status", + "@com_google_absl//absl/status", 
"@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", "@local_tsl//tsl/platform:statusor", ], ) @@ -754,6 +755,7 @@ cc_library( "transforms/passes.h", ], deps = [ + ":constant_utils", ":convert_type", ":tensorflow_lite", ":tensorflow_lite_passes_inc_gen", @@ -1267,24 +1269,29 @@ tf_cc_binary( ":tf_to_tfl_flatbuffer", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir:init_mlir", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:translate_cl_options", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/tf2xla/kernels:xla_ops", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:errors", "//tensorflow/lite:framework", "//tensorflow/lite/schema:schema_fbs", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:FuncExtensions", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - "@local_tsl//tsl/platform:statusor", "@local_xla//xla/translate/hlo_to_mhlo:translate", "@stablehlo//:stablehlo_ops", ], @@ -1338,23 +1345,17 @@ cc_library( "//tensorflow/compiler/mlir/lite/stablehlo:tfl_legalize_hlo", # buildcleaner: keep "//tensorflow/compiler/mlir/lite/stablehlo:transforms", "//tensorflow/compiler/mlir/lite/stablehlo:uniform_quantized_stablehlo_to_tfl_pass", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", - 
"//tensorflow/compiler/mlir/tensorflow:translate_lib", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", - "//tensorflow/compiler/mlir/tensorflow/transforms:tf_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", "//tensorflow/core:core_cpu_base", - "//tensorflow/lite/toco:model_flags_proto_cc", "//tensorflow/lite/toco:toco_flags_proto_cc", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - "@local_xla//xla/mlir_hlo", - "@local_xla//xla/mlir_hlo:all_passes", "@local_xla//xla/mlir_hlo:mhlo_passes", + "@stablehlo//stablehlo/experimental:experimental_stablehlo_passes", ], ) @@ -1370,45 +1371,54 @@ cc_library( ":tensorflow_lite", ":tf_tfl_passes", "//tensorflow/cc/saved_model:loader", + "//tensorflow/compiler/mlir:op_or_arg_name_mapper", "//tensorflow/compiler/mlir/lite/debug", + "//tensorflow/compiler/mlir/lite/metrics:error_collector", "//tensorflow/compiler/mlir/lite/metrics:error_collector_inst", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/lite/quantization/stablehlo:quantization", "//tensorflow/compiler/mlir/lite/stablehlo:op_stat_pass", "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_tfl", "//tensorflow/compiler/mlir/lite/stablehlo:stablehlo_util", "//tensorflow/compiler/mlir/lite/stablehlo:transforms", - "//tensorflow/compiler/mlir/lite/stablehlo/serializer:flatbuffer_export", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/stablehlo:quantize_passes", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq_impl", # buildcleaner: keep; prevents undefined reference "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", 
"//tensorflow/compiler/mlir/quantization/tensorflow:quantize_passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:error_util", + "//tensorflow/compiler/mlir/tensorflow:mlir_import_options", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/compiler/mlir/tensorflow:tf_dialect_lib", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", - "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_passes", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", - "//tensorflow/core/platform:status", + "//tensorflow/lite/c:c_api_types", + "//tensorflow/lite/experimental/remat:metadata_util", "//tensorflow/lite/python/metrics:converter_error_data_proto_cc", + "//tensorflow/lite/schema:schema_fbs", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/tools/optimize:quantize_weights", "//tensorflow/lite/tools/optimize:reduced_precision_support", + "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:span", + "@flatbuffers//:runtime_cc", "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FuncExtensions", "@llvm-project//mlir:IR", "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/platform:protobuf", "@local_tsl//tsl/platform:statusor", ], ) diff --git a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h 
b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h index 9dd57f2bdea429..39f81c7a6a770d 100644 --- a/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h +++ b/tensorflow/compiler/mlir/lite/common/tfl_pass_config.h @@ -95,6 +95,10 @@ struct PassConfig { // ops and to convert kernels to quantized kernels wherever appropriate. quant::QDQConversionMode qdq_conversion_mode = quant::QDQConversionMode::kQDQNone; + + // When set to true, StableHLO Quantizer is run. The full configuration for + // the quantizer is at `TocoFlags::quantization_config`. + bool enable_stablehlo_quantizer = false; }; inline llvm::raw_ostream& operator<<(llvm::raw_ostream& os, diff --git a/tensorflow/compiler/mlir/lite/debug/debug_test.cc b/tensorflow/compiler/mlir/lite/debug/debug_test.cc index 1bcb86de9a4a94..127d485b842f94 100644 --- a/tensorflow/compiler/mlir/lite/debug/debug_test.cc +++ b/tensorflow/compiler/mlir/lite/debug/debug_test.cc @@ -53,11 +53,7 @@ limitations under the License. #include "tsl/platform/status.h" namespace tensorflow { -namespace { - -using ::testing::HasSubstr; -using ::testing::IsEmpty; -using ::testing::Not; +namespace debug_test { class NopPass : public mlir::PassWrapper> { public: @@ -84,6 +80,15 @@ class AlwaysFailPass void runOnOperation() override { signalPassFailure(); } }; +} // namespace debug_test + +namespace { + +using ::testing::HasSubstr; +using ::testing::IsEmpty; +using ::testing::Not; +using namespace tensorflow::debug_test; + class InitPassManagerTest : public testing::Test { protected: InitPassManagerTest() @@ -179,8 +184,7 @@ TEST_F(InitPassManagerTest, DumpToDir) { TF_ASSERT_OK(tsl::ReadFileToString( tsl::Env::Default(), tsl::io::JoinPath( - dump_dir, - "00000000.main.tensorflow_anonymous_namespace_NopPass_after.mlir"), + dump_dir, "00000000.main.tensorflow_debug_test_NopPass_after.mlir"), &mlir_dump)); EXPECT_THAT(mlir_dump, Not(IsEmpty())); } @@ -190,7 +194,7 @@ TEST_F(InitPassManagerTest, DumpToDir) { tsl::Env::Default(), 
tsl::io::JoinPath( dump_dir, - "00000000.main.tensorflow_anonymous_namespace_NopPass_before.mlir"), + "00000000.main.tensorflow_debug_test_NopPass_before.mlir"), &mlir_dump)); EXPECT_THAT(mlir_dump, Not(IsEmpty())); } @@ -207,12 +211,10 @@ TEST_F(InitPassManagerTest, PrintIRBeforeEverything) { pm.addPass(std::make_unique()); ASSERT_TRUE(mlir::succeeded(pm.run(*module_))); - EXPECT_THAT( - captured_out, - HasSubstr("IR Dump Before tensorflow::(anonymous namespace)::NopPass")); EXPECT_THAT(captured_out, - Not(HasSubstr( - "IR Dump After tensorflow::(anonymous namespace)::NopPass"))); + HasSubstr("IR Dump Before tensorflow::debug_test::NopPass")); + EXPECT_THAT(captured_out, + Not(HasSubstr("IR Dump After tensorflow::debug_test::NopPass"))); } TEST_F(InitPassManagerTest, PrintIRAfterEverything) { @@ -226,13 +228,11 @@ TEST_F(InitPassManagerTest, PrintIRAfterEverything) { pm.addPass(std::make_unique()); ASSERT_TRUE(mlir::succeeded(pm.run(*module_))); + EXPECT_THAT(captured_out, + HasSubstr("IR Dump After tensorflow::debug_test::MutatePass")); EXPECT_THAT( captured_out, - HasSubstr("IR Dump After tensorflow::(anonymous namespace)::MutatePass")); - EXPECT_THAT( - captured_out, - Not(HasSubstr( - "IR Dump Before tensorflow::(anonymous namespace)::MutatePass"))); + Not(HasSubstr("IR Dump Before tensorflow::debug_test::MutatePass"))); } TEST_F(InitPassManagerTest, PrintIRBeforeAndAfterEverything) { @@ -247,13 +247,10 @@ TEST_F(InitPassManagerTest, PrintIRBeforeAndAfterEverything) { pm.addPass(std::make_unique()); ASSERT_TRUE(mlir::succeeded(pm.run(*module_))); - EXPECT_THAT( - captured_out, - HasSubstr("IR Dump After tensorflow::(anonymous namespace)::MutatePass")); - EXPECT_THAT( - captured_out, - HasSubstr( - "IR Dump Before tensorflow::(anonymous namespace)::MutatePass")); + EXPECT_THAT(captured_out, + HasSubstr("IR Dump After tensorflow::debug_test::MutatePass")); + EXPECT_THAT(captured_out, + HasSubstr("IR Dump Before tensorflow::debug_test::MutatePass")); } 
TEST_F(InitPassManagerTest, ElideLargeElementAttrs) { diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc index 04a92f3412ba82..25f62ce0981b6f 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_export.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_export.cc @@ -2632,8 +2632,7 @@ Translator::CreateMetadataVector() { } else { module_.emitError( "all values in tfl.metadata's dictionary key-value pairs should " - "be " - "string attributes"); + "be string attributes"); return std::nullopt; } } diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc index 26ae90d95a97ea..bd912797d44820 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_import.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_import.cc @@ -40,6 +40,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/StringSet.h" #include "llvm/Analysis/AssumeBundleQueries.h" #include "llvm/Support/Casting.h" #include "llvm/Support/FormatVariadic.h" @@ -1495,6 +1496,8 @@ OwningOpRef tflite::FlatBufferToMlir( bool use_stablehlo_constant = false; + llvm::SmallVector metadata_attrs; + mlir::StringSet<> seen_attr; for (const auto& metadata : model->metadata) { if (metadata->name == tflite::kModelControlDependenciesMetadataKey) { const std::vector& data = model->buffers[metadata->buffer]->data; @@ -1502,15 +1505,28 @@ OwningOpRef tflite::FlatBufferToMlir( reinterpret_cast(data.data()), data.size(), &model_control_dependencies)) { return emitError(base_loc, - "Invalid model_control_dependencies metadata"), + "invalid model_control_dependencies metadata"), nullptr; } - break; + continue; } + + // Skip already seen attributes. Ideally there should be no duplicates here. 
+ if (!seen_attr.try_emplace(metadata->name).second) continue; + // check if the model is serialized using stablehlo constant tensor if (metadata->name == tflite::kModelUseStablehloTensorKey) { use_stablehlo_constant = true; + metadata_attrs.emplace_back(builder.getStringAttr(metadata->name), + builder.getStringAttr("true")); + continue; } + + std::vector buffer = model->buffers[metadata->buffer]->data; + metadata_attrs.emplace_back( + builder.getStringAttr(metadata->name), + builder.getStringAttr(llvm::StringRef( + reinterpret_cast(buffer.data()), buffer.size()))); } std::vector func_names; @@ -1528,18 +1544,15 @@ OwningOpRef tflite::FlatBufferToMlir( builder.getStringAttr(model->description)); } + if (!metadata_attrs.empty()) { + module->setAttr("tfl.metadata", builder.getDictionaryAttr(metadata_attrs)); + } + if (!model->signature_defs.empty()) { module->setAttr("tf_saved_model.semantics", mlir::UnitAttr::get(builder.getContext())); } - if (use_stablehlo_constant) { - module->setAttr("tfl.metadata", - builder.getDictionaryAttr(builder.getNamedAttr( - tflite::kModelUseStablehloTensorKey, - builder.getStringAttr("true")))); - } - absl::flat_hash_map subgraph_to_signature_map; for (int i = 0; i < model->signature_defs.size(); i++) { diff --git a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc index 665d101a4f19af..db138d74b76048 100644 --- a/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc +++ b/tensorflow/compiler/mlir/lite/flatbuffer_operator.cc @@ -285,14 +285,6 @@ static mlir::Attribute BuildRankedTensorAttr(std::vector shape, return mlir::DenseIntElementsAttr::get(ty, value); } -static mlir::Attribute BuildI64ArrayAttr(std::vector shape, - std::vector value, - mlir::Builder builder) { - // Expand splats. BuildI64ArrayAttr assumes shape.size() == 1. 
- if (value.size() == 1) value.resize(shape[0], value[0]); - return builder.getDenseI64ArrayAttr(value); -} - static mlir::Attribute BuildF32ArrayAttr(std::vector value, mlir::Builder builder) { std::vector typecast(value.begin(), value.end()); @@ -399,29 +391,22 @@ void BuiltinOptions2ToAttributesManual( if (const auto* op = op_union.AsStablehloBroadcastInDimOptions()) { attributes.emplace_back(builder.getNamedAttr( "broadcast_dimensions", - BuildRankedTensorAttr( - {static_cast(op->broadcast_dimensions.size())}, - op->broadcast_dimensions, builder))); + builder.getDenseI64ArrayAttr(op->broadcast_dimensions))); return; } if (const auto* op = op_union.AsStablehloSliceOptions()) { - std::vector shape = { - static_cast(op->start_indices.size())}; attributes.emplace_back(builder.getNamedAttr( - "start_indices", BuildI64ArrayAttr(shape, op->start_indices, builder))); + "start_indices", builder.getDenseI64ArrayAttr(op->start_indices))); attributes.emplace_back(builder.getNamedAttr( - "limit_indices", BuildI64ArrayAttr(shape, op->limit_indices, builder))); + "limit_indices", builder.getDenseI64ArrayAttr(op->limit_indices))); attributes.emplace_back(builder.getNamedAttr( - "strides", BuildI64ArrayAttr(shape, op->strides, builder))); + "strides", builder.getDenseI64ArrayAttr(op->strides))); return; } if (const auto* op = op_union.AsStablehloConvolutionOptions()) { if (!(op->window_strides.empty())) { - std::vector shape; - shape.push_back(static_cast(op->window_strides.size())); attributes.emplace_back(builder.getNamedAttr( - "window_strides", - BuildRankedTensorAttr(shape, op->window_strides, builder))); + "window_strides", builder.getDenseI64ArrayAttr(op->window_strides))); } if (!(op->padding.empty())) { std::vector shape; @@ -431,25 +416,19 @@ void BuiltinOptions2ToAttributesManual( "padding", BuildRankedTensorAttr(shape, op->padding, builder))); } if (!(op->lhs_dilation.empty())) { - std::vector shape; - shape.push_back(static_cast(op->lhs_dilation.size())); 
attributes.emplace_back(builder.getNamedAttr( - "lhs_dilation", - BuildRankedTensorAttr(shape, op->lhs_dilation, builder))); + "lhs_dilation", builder.getDenseI64ArrayAttr(op->lhs_dilation))); } if (!(op->rhs_dilation.empty())) { - std::vector shape; - shape.push_back(static_cast(op->rhs_dilation.size())); attributes.emplace_back(builder.getNamedAttr( - "rhs_dilation", - BuildRankedTensorAttr(shape, op->rhs_dilation, builder))); + "rhs_dilation", builder.getDenseI64ArrayAttr(op->rhs_dilation))); } - if (!(op->window_reversal.empty())) + if (!(op->window_reversal.empty())) { + llvm::SmallVector window_reversal; + for (bool b : op->window_reversal) window_reversal.push_back(b); attributes.emplace_back(builder.getNamedAttr( - "window_reversal", - BuildRankedTensorAttr( - {static_cast(op->window_reversal.size())}, - op->window_reversal, builder))); + "window_reversal", builder.getDenseBoolArrayAttr(window_reversal))); + } attributes.emplace_back(builder.getNamedAttr( "dimension_numbers", mlir::stablehlo::ConvDimensionNumbersAttr::get( @@ -502,20 +481,18 @@ void BuiltinOptions2ToAttributesManual( static_cast(op->edge_padding_low.size())}; attributes.emplace_back(builder.getNamedAttr( "edge_padding_low", - BuildI64ArrayAttr(shape, op->edge_padding_low, builder))); + builder.getDenseI64ArrayAttr(op->edge_padding_low))); attributes.emplace_back(builder.getNamedAttr( "edge_padding_high", - BuildI64ArrayAttr(shape, op->edge_padding_high, builder))); + builder.getDenseI64ArrayAttr(op->edge_padding_high))); attributes.emplace_back(builder.getNamedAttr( "interior_padding", - BuildI64ArrayAttr(shape, op->interior_padding, builder))); + builder.getDenseI64ArrayAttr(op->interior_padding))); return; } if (const auto* op = op_union.AsStablehloDynamicSliceOptions()) { attributes.emplace_back(builder.getNamedAttr( - "slice_sizes", - BuildI64ArrayAttr({static_cast(op->slice_sizes.size())}, - op->slice_sizes, builder))); + "slice_sizes", builder.getDenseI64ArrayAttr(op->slice_sizes))); 
return; } if (const auto* op = op_union.AsStablehloCompareOptions()) { @@ -540,39 +517,27 @@ void BuiltinOptions2ToAttributesManual( } if (const auto* op = op_union.AsStablehloReduceOptions()) { attributes.emplace_back(builder.getNamedAttr( - "dimensions", - BuildRankedTensorAttr({static_cast(op->dimensions.size())}, - op->dimensions, builder))); + "dimensions", builder.getDenseI64ArrayAttr(op->dimensions))); return; } if (const auto* op = op_union.AsStablehloReduceWindowOptions()) { if (!op->window_dimensions.empty()) { attributes.emplace_back(builder.getNamedAttr( "window_dimensions", - BuildRankedTensorAttr( - {static_cast(op->window_dimensions.size())}, - op->window_dimensions, builder))); + builder.getDenseI64ArrayAttr(op->window_dimensions))); } if (!op->window_strides.empty()) { attributes.emplace_back(builder.getNamedAttr( - "window_strides", - BuildRankedTensorAttr( - {static_cast(op->window_strides.size())}, - op->window_strides, builder))); + "window_strides", builder.getDenseI64ArrayAttr(op->window_strides))); } if (!op->base_dilations.empty()) { attributes.emplace_back(builder.getNamedAttr( - "base_dilations", - BuildRankedTensorAttr( - {static_cast(op->base_dilations.size())}, - op->base_dilations, builder))); + "base_dilations", builder.getDenseI64ArrayAttr(op->base_dilations))); } if (!op->window_dilations.empty()) { attributes.emplace_back(builder.getNamedAttr( "window_dilations", - BuildRankedTensorAttr( - {static_cast(op->window_dilations.size())}, - op->window_dilations, builder))); + builder.getDenseI64ArrayAttr(op->window_dilations))); } if (!op->padding.empty()) { attributes.emplace_back(builder.getNamedAttr( @@ -617,9 +582,7 @@ void BuiltinOptions2ToAttributesManual( builder.getNamedAttr("dimension_numbers", gather_dim)); if (!op->slice_sizes.empty()) { attributes.emplace_back(builder.getNamedAttr( - "slice_sizes", - BuildRankedTensorAttr({static_cast(op->slice_sizes.size())}, - op->slice_sizes, builder))); + "slice_sizes", 
builder.getDenseI64ArrayAttr(op->slice_sizes))); } attributes.emplace_back(builder.getNamedAttr( "indices_are_sorted", BuildBoolAttr(op->indices_are_sorted, builder))); @@ -628,9 +591,7 @@ void BuiltinOptions2ToAttributesManual( if (const auto* op = op_union.AsStablehloTransposeOptions()) { if (!op->permutation.empty()) { attributes.emplace_back(builder.getNamedAttr( - "permutation", - BuildI64ArrayAttr({static_cast(op->permutation.size())}, - op->permutation, builder))); + "permutation", builder.getDenseI64ArrayAttr(op->permutation))); } return; diff --git a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td index 45a1e3b25e1335..e8f9787947d6eb 100644 --- a/tensorflow/compiler/mlir/lite/ir/tfl_ops.td +++ b/tensorflow/compiler/mlir/lite/ir/tfl_ops.td @@ -1061,7 +1061,7 @@ def TFL_DepthwiseConv2DOp : def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ Pure, AccumulatorUniformScale<2, 0, 1>, AffineQuantizedOpInterface, - AffineOpCoefficient<-1, 1>, + AffineOpCoefficient<0, 1>, TFL_SparseOp, DeclareOpInterfaceMethods, QuantizableResult, @@ -1097,7 +1097,7 @@ def TFL_FullyConnectedOp : TFL_Op<"fully_connected", [ let extraClassDeclaration = [{ // AffineQuantizedOpInterface: int GetChannelDimIndex() { return 0; } - int GetQuantizationDimIndex() { return -1; } + int GetQuantizationDimIndex() { return 0; } // SparseOpInterface: std::vector GetSparseOperands() { return {1}; } std::vector> GetFloatBlockSize() { return {{1, 4}}; } @@ -1219,12 +1219,12 @@ def TFL_GatherNdOp : TFL_Op<"gather_nd", [ }]; let arguments = (ins - TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, TFL_Str]>:$params, + TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, QI8, TFL_Str]>:$params, TFL_TensorOf<[I16, I32, I64]>:$indices ); let results = (outs - TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, TFL_Str]>:$output + TFL_TensorOf<[F32, I1, I8, I16, I64, I32, UI8, QI8, TFL_Str]>:$output ); } @@ -5399,12 +5399,12 @@ subsequent operation and then be optimized away, 
however.) }]; let arguments = (ins - TFL_TensorOf<[F32, I32, I1, TFL_I4, I8, QI8, UI8, QUI8, I16, QI16, I64, Complex>]>:$input, + TFL_TensorOf<[F32, I32, I1, TFL_I4, I8, QI8, UI8, UI32, QUI8, I16, QI16, I64, Complex>]>:$input, TFL_I32OrI64Tensor:$shape ); let results = (outs - TFL_TensorOf<[F32, I32, I1, TFL_I4, I8, QI8, UI8, QUI8, I16, QI16, I64, Complex>]>:$output + TFL_TensorOf<[F32, I32, I1, TFL_I4, I8, QI8, UI8, UI32, QUI8, I16, QI16, I64, Complex>]>:$output ); let hasCanonicalizer = 1; diff --git a/tensorflow/compiler/mlir/lite/python/BUILD b/tensorflow/compiler/mlir/lite/python/BUILD index a911d438d20368..0ac8bc0ff65117 100644 --- a/tensorflow/compiler/mlir/lite/python/BUILD +++ b/tensorflow/compiler/mlir/lite/python/BUILD @@ -23,21 +23,25 @@ cc_library( srcs = ["tf_tfl_flatbuffer_helpers.cc"], hdrs = ["tf_tfl_flatbuffer_helpers.h"], deps = [ + "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tf_tfl_passes", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:import_model", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/lite/toco:model_flags_proto_cc", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/toco:types_proto_cc", "//tensorflow/lite/tools/optimize:reduced_precision_support", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", 
"@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", @@ -56,10 +60,8 @@ cc_library( deps = [ ":tf_tfl_flatbuffer_helpers", "//tensorflow/compiler/mlir/lite:common", - "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tf_tfl_passes", - "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", - "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:lib", @@ -68,12 +70,8 @@ cc_library( "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/toco:types_proto_cc", "//tensorflow/lite/tools/optimize:reduced_precision_support", - "@llvm-project//llvm:Support", - "@llvm-project//mlir:FuncDialect", + "@com_google_absl//absl/status", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", - "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", ], ) @@ -90,6 +88,7 @@ cc_library( "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", @@ -114,23 +113,19 @@ cc_library( ":tf_tfl_flatbuffer_helpers", "//tensorflow/compiler/mlir/lite:common", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/lite:tf_tfl_passes", - "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:mlir_roundtrip_flags", + "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", 
"//tensorflow/lite/toco:model_flags_proto_cc", "//tensorflow/lite/toco:toco_flags_proto_cc", "//tensorflow/lite/toco:types_proto_cc", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", - "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", - "@llvm-project//mlir:Transforms", "@local_xla//xla/service:hlo_parser", "@local_xla//xla/service:hlo_proto_cc", "@local_xla//xla/translate/hlo_to_mhlo:hlo_to_mlir_hlo", diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc index 9409856cd4c864..f678daf32f234c 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.cc @@ -16,49 +16,41 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h" #include -#include #include #include #include -#include "llvm/Support/ToolOutputFile.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "absl/status/status.h" #include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project -#include "mlir/Support/FileUtilities.h" // from @llvm-project -#include "mlir/Transforms/ViewOpGraph.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" -#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" -#include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" -#include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include 
"tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/lib/core/errors.h" -#include "tensorflow/core/platform/status.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" +#include "tensorflow/lite/tools/optimize/reduced_precision_support.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace tensorflow { -Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - const GraphDebugInfo& debug_info, - const GraphDef& input, - string* result) { + +absl::Status ConvertGraphDefToTFLiteFlatBuffer( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + const GraphDebugInfo& debug_info, const GraphDef& input, + std::string* result) { using ::tflite::optimize::ReducedPrecisionSupport; mlir::MLIRContext context; GraphImportConfig specs; mlir::quant::QuantizationSpecs quant_specs; // Parse input arrays. - std::vector node_names; - std::vector node_dtypes; + std::vector node_names; + std::vector node_dtypes; std::vector>> node_shapes; std::vector> node_mins; std::vector> node_maxs; @@ -68,21 +60,20 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, model_flags, toco_flags, &quant_specs, &node_names, &node_dtypes, &node_shapes, &node_mins, &node_maxs)); - TF_RETURN_IF_ERROR(tensorflow::ParseInputArrayInfo( - node_names, node_dtypes, node_shapes, &specs.inputs)); + TF_RETURN_IF_ERROR( + ParseInputArrayInfo(node_names, node_dtypes, node_shapes, &specs.inputs)); // Parse output arrays. 
- std::vector output_arrays(model_flags.output_arrays().begin(), - model_flags.output_arrays().end()); - TF_RETURN_IF_ERROR( - tensorflow::ParseOutputArrayInfo(output_arrays, &specs.outputs)); + std::vector output_arrays(model_flags.output_arrays().begin(), + model_flags.output_arrays().end()); + TF_RETURN_IF_ERROR(ParseOutputArrayInfo(output_arrays, &specs.outputs)); // Parse control output arrays. - std::vector control_output_arrays( + std::vector control_output_arrays( model_flags.control_output_arrays().begin(), model_flags.control_output_arrays().end()); - TF_RETURN_IF_ERROR(tensorflow::ParseOutputArrayInfo(control_output_arrays, - &specs.control_outputs)); + TF_RETURN_IF_ERROR( + ParseOutputArrayInfo(control_output_arrays, &specs.control_outputs)); specs.prune_unused_nodes = true; specs.convert_legacy_fed_inputs = true; @@ -118,10 +109,12 @@ Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, toco_flags.guarantee_all_funcs_one_use(); pass_config.enable_stablehlo_conversion = toco_flags.convert_to_stablehlo(); + // StableHLO Quantizer is not supported for GraphDef inputs, so + // quantization_py_function_lib is set to nullptr. return internal::ConvertMLIRToTFLiteFlatBuffer( model_flags, toco_flags, std::move(module), pass_config, - /*saved_model_tags=*/{}, result, - /*session=*/std::nullopt); + /*saved_model_tags=*/{}, result, /*saved_model_bundle=*/nullptr, + /*quantization_py_function_lib=*/nullptr); } } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h index e69d3c718d9b37..54f8a996e8883c 100644 --- a/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/graphdef_to_tfl_flatbuffer.h @@ -15,9 +15,11 @@ limitations under the License. 
#ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_GRAPHDEF_TO_TFL_FLATBUFFER_H_ +#include + +#include "absl/status/status.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" -#include "tensorflow/core/lib/core/errors.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" @@ -26,10 +28,10 @@ namespace tensorflow { // Converts the given GraphDef to a TF Lite FlatBuffer string according to the // given model flags, toco flags and debug information. Returns error status if // it fails to convert the input. -Status ConvertGraphDefToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - const GraphDebugInfo& debug_info, - const GraphDef& input, string* result); +absl::Status ConvertGraphDefToTFLiteFlatBuffer( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + const GraphDebugInfo& debug_info, const GraphDef& input, + std::string* result); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc index f81b5e8b5da6a7..b25040827ccbae 100644 --- a/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/jax_to_tfl_flatbuffer.cc @@ -20,27 +20,23 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" +#include "absl/status/status.h" #include "absl/strings/str_join.h" -#include "absl/types/span.h" #include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringSet.h" -#include "llvm/Support/ToolOutputFile.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project -#include "mlir/Transforms/ViewOpGraph.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" -#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" -#include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "xla/service/hlo.pb.h" #include "xla/service/hlo_parser.h" #include "xla/translate/hlo_to_mhlo/hlo_to_mlir_hlo.h" @@ -49,23 +45,24 @@ limitations under the License. 
#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/core/platform/status.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" -#include "tsl/platform/statusor.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep namespace tensorflow { namespace { // Error collector that simply ignores errors reported. -class NoOpErrorCollector : public tensorflow::protobuf::io::ErrorCollector { +class NoOpErrorCollector : public protobuf::io::ErrorCollector { public: - void AddError(int line, int column, const string& message) override {} + void AddError(int line, int column, const std::string& message) override {} }; bool LoadHloProto(const std::string& contents, xla::HloProto* hlo_proto) { - tensorflow::protobuf::TextFormat::Parser parser; + // NOLINTNEXTLINE: Use tsl::protobuf to be compatible with OSS. 
+ tsl::protobuf::TextFormat::Parser parser; NoOpErrorCollector collector; parser.RecordErrorsTo(&collector); return hlo_proto->ParseFromString(contents) || @@ -75,10 +72,10 @@ bool LoadHloProto(const std::string& contents, xla::HloProto* hlo_proto) { } mlir::OwningOpRef HloToMlirHloTranslateFunction( - llvm::StringRef input, mlir::MLIRContext* context, + mlir::StringRef input, mlir::MLIRContext* context, bool import_all_computations) { xla::HloProto hlo_proto; - string content(input.data(), input.size()); + std::string content(input.data(), input.size()); if (!LoadHloProto(content, &hlo_proto)) { LOG(ERROR) << "Failed to load proto"; return nullptr; @@ -100,7 +97,7 @@ mlir::OwningOpRef HloTextToMlirHloTranslateFunction( llvm::StringRef input, mlir::MLIRContext* context, bool import_all_computations) { xla::HloProto hlo_proto; - string content(input.data(), input.size()); + std::string content(input.data(), input.size()); auto hlo_module_error = xla::ParseAndReturnUnverifiedModule(content); if (!hlo_module_error.ok()) { @@ -122,16 +119,16 @@ mlir::OwningOpRef HloTextToMlirHloTranslateFunction( } } // namespace -Status ConvertJaxToTFLiteFlatBuffer(const std::string& input, - const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - string* result) { +absl::Status ConvertJaxToTFLiteFlatBuffer(const std::string& input, + const toco::ModelFlags& model_flags, + const toco::TocoFlags& toco_flags, + std::string* result) { mlir::MLIRContext context; mlir::quant::QuantizationSpecs quant_specs; // Parse input arrays. - std::vector node_names; - std::vector node_dtypes; + std::vector node_names; + std::vector node_dtypes; std::vector>> node_shapes; std::vector> node_mins; std::vector> node_maxs; @@ -191,10 +188,12 @@ Status ConvertJaxToTFLiteFlatBuffer(const std::string& input, // phase. 
main_func->setAttr("tf.entry_function", builder.getDictionaryAttr(attrs)); + // StableHLO Quantizer is not supported for JAX input models, so + // quantization_py_function_lib is set to nullptr. auto status = internal::ConvertMLIRToTFLiteFlatBuffer( model_flags, toco_flags, std::move(module), pass_config, - /*saved_model_tags=*/{}, result, - /*session=*/std::nullopt); + /*saved_model_tags=*/{}, result, /*saved_model_bundle=*/nullptr, + /*quantization_py_function_lib=*/nullptr); return status; } diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc index 3bd7b947e61021..57550c9f5b0f9d 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.cc @@ -35,6 +35,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" @@ -48,6 +49,8 @@ limitations under the License. 
namespace tensorflow { +using tensorflow::quantization::PyFunctionLibrary; + Status HandleInputOutputArraysWithModule( const toco::ModelFlags& model_flags, mlir::OwningOpRef* module) { @@ -124,9 +127,10 @@ Status HandleInputOutputArraysWithModule( return OkStatus(); } -Status ConvertSavedModelToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, - const toco::TocoFlags& toco_flags, - string* result) { +Status ConvertSavedModelToTFLiteFlatBuffer( + const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, + std::string* result, + const PyFunctionLibrary* quantization_py_function_lib) { mlir::MLIRContext context; mlir::quant::QuantizationSpecs quant_specs; @@ -199,15 +203,19 @@ Status ConvertSavedModelToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, pass_config.enable_stablehlo_conversion = toco_flags.convert_to_stablehlo(); pass_config.legalize_custom_tensor_list_ops = toco_flags.legalize_custom_tensor_list_ops(); + pass_config.enable_stablehlo_quantizer = toco_flags.has_quantization_config(); if (toco_flags.qdq_conversion_mode() == "STATIC") { - pass_config.qdq_conversion_mode = + pass_config.quant_specs.qdq_conversion_mode = mlir::quant::QDQConversionMode::kQDQStatic; } else if (toco_flags.qdq_conversion_mode() == "DYNAMIC") { - pass_config.qdq_conversion_mode = + pass_config.quant_specs.qdq_conversion_mode = mlir::quant::QDQConversionMode::kQDQDynamic; + // Need to set this or else the ops will still use floating point kernels + pass_config.quant_specs.inference_type = tensorflow::DT_QINT8; } else if (toco_flags.qdq_conversion_mode() == "NONE") { - pass_config.qdq_conversion_mode = mlir::quant::QDQConversionMode::kQDQNone; + pass_config.quant_specs.qdq_conversion_mode = + mlir::quant::QDQConversionMode::kQDQNone; } else { return errors::InvalidArgument("Unknown QDQ conversion mode: ", toco_flags.qdq_conversion_mode()); @@ -225,7 +233,7 @@ Status ConvertSavedModelToTFLiteFlatBuffer(const toco::ModelFlags& model_flags, // TODO(b/153507667): 
Pass the session object when importing logic is removed. auto status = internal::ConvertMLIRToTFLiteFlatBuffer( model_flags, toco_flags, std::move(module), pass_config, tags, result, - bundle ? bundle->GetSession() : nullptr); + bundle.get(), quantization_py_function_lib); return status; } diff --git a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h index 362e9e39ae54c8..50d61dbd4f873b 100644 --- a/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/python/saved_model_to_tfl_flatbuffer.h @@ -15,6 +15,7 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_PYTHON_SAVED_MODEL_TO_TFL_FLATBUFFER_H_ +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" #include "tensorflow/core/lib/core/errors.h" @@ -28,7 +29,8 @@ namespace tensorflow { // status if it fails to convert the input. Status ConvertSavedModelToTFLiteFlatBuffer( const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, - string* result); + string* result, + const quantization::PyFunctionLibrary* quantization_py_function_lib); } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc index 980d74d6a47aa5..795485328d4779 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.cc @@ -15,48 +15,55 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h" #include -#include #include #include #include #include +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" #include "llvm/Support/ToolOutputFile.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/ViewOpGraph.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" -#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops_n_z.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/core/framework/graph.pb.h" #include "tensorflow/core/framework/graph_debug_info.pb.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/lib/core/errors.h" -#include 
"tensorflow/core/platform/status.h" +#include "tensorflow/core/public/session.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" #include "tensorflow/lite/toco/types.pb.h" #include "tensorflow/lite/tools/optimize/reduced_precision_support.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep #include "tsl/platform/statusor.h" -using tsl::StatusOr; - namespace tensorflow { namespace internal { namespace { using ::mlir::quant::ReducedPrecisionSupport; +using ::tensorflow::quantization::PyFunctionLibrary; // Op def string for TFLite_Detection_PostProcess Op. -const char kDetectionPostProcessOp[] = +constexpr mlir::StringRef kDetectionPostProcessOp = "name: 'TFLite_Detection_PostProcess' input_arg: { name: " "'raw_outputs/box_encodings' type: DT_FLOAT } input_arg: { name: " "'raw_outputs/class_predictions' type: DT_FLOAT } input_arg: { name: " @@ -74,7 +81,7 @@ const char kDetectionPostProcessOp[] = "'detections_per_class' type: 'int' default_value { i : 100 }} attr { " "name: 'use_regular_nms' type: 'bool' default_value { b : false }}"; -const char kUnidirectionalSequenceLstmOp[] = +constexpr mlir::StringRef kUnidirectionalSequenceLstmOp = "name: 'UnidirectionalSequenceLstm' input_arg: {name: 'Input' type: " "DT_FLOAT} input_arg: { name: 'InputToInputWeights' type: DT_FLOAT } " "input_arg: { name: 'InputToForgetWeights' type: DT_FLOAT } input_arg: { " @@ -98,7 +105,7 @@ const char kUnidirectionalSequenceLstmOp[] = "'LastState' type: DT_FLOAT } output_arg: { name: 'Output' type: DT_FLOAT} " "attr : { name: '_tflite_input_indices' type: 'list(int)'}"; -const char kUnidirectionalSequenceRnnOp[] = +constexpr mlir::StringRef kUnidirectionalSequenceRnnOp = "name: 'UnidirectionalSequenceRnn' input_arg: {name: 'Input' type: " "DT_FLOAT} input_arg: { name: 'Weights' type: DT_FLOAT } " "input_arg: { name: 'RecurrentWeights' type: DT_FLOAT } input_arg: { " @@ -158,8 +165,9 @@ DataType 
ConvertIODataTypeToDataType(toco::IODataType dtype) { } } -StatusOr> InputStatsToMinMax(double mean, double std, - DataType type) { +absl::StatusOr> InputStatsToMinMax(double mean, + double std, + DataType type) { // Only qint8 and quint8 are considered here. double qmin, qmax; if (type == DT_QUINT8) { @@ -169,58 +177,59 @@ StatusOr> InputStatsToMinMax(double mean, double std, qmin = -128.0; qmax = 127.0; } else { - return errors::InvalidArgument("Only int8 and uint8 are considered."); + return absl::InvalidArgumentError("Only int8 and uint8 are considered."); } return std::make_pair((qmin - mean) / std, (qmax - mean) / std); } -Status RegisterCustomBuiltinOps(const std::vector extra_tf_opdefs) { +absl::Status RegisterCustomBuiltinOps( + const std::vector extra_tf_opdefs) { for (const auto& tf_opdefs_string : extra_tf_opdefs) { - tensorflow::OpDef opdef; - if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, - &opdef)) { + OpDef opdef; + // NOLINTNEXTLINE: Use tsl::protobuf to be compatible with OSS. + if (!tsl::protobuf::TextFormat::ParseFromString(tf_opdefs_string, &opdef)) { return errors::InvalidArgument("fail to parse extra OpDef"); } // Make sure the op is not already registered. If registered continue. 
const OpRegistrationData* op_reg = - tensorflow::OpRegistry::Global()->LookUp(opdef.name()); + OpRegistry::Global()->LookUp(opdef.name()); if (op_reg) continue; - tensorflow::OpRegistry::Global()->Register( - [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { - *op_reg_data = tensorflow::OpRegistrationData(opdef); - return OkStatus(); + OpRegistry::Global()->Register( + [opdef](OpRegistrationData* op_reg_data) -> absl::Status { + *op_reg_data = OpRegistrationData(opdef); + return absl::OkStatus(); }); } - return OkStatus(); + return absl::OkStatus(); } } // namespace -Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { +absl::Status RegisterAllCustomOps(const toco::TocoFlags& toco_flags) { // Register any custom OpDefs. - std::vector extra_tf_opdefs(toco_flags.custom_opdefs().begin(), - toco_flags.custom_opdefs().end()); - extra_tf_opdefs.push_back(kDetectionPostProcessOp); - extra_tf_opdefs.push_back(kUnidirectionalSequenceLstmOp); - extra_tf_opdefs.push_back(kUnidirectionalSequenceRnnOp); + std::vector extra_tf_opdefs(toco_flags.custom_opdefs().begin(), + toco_flags.custom_opdefs().end()); + extra_tf_opdefs.push_back(kDetectionPostProcessOp.str()); + extra_tf_opdefs.push_back(kUnidirectionalSequenceLstmOp.str()); + extra_tf_opdefs.push_back(kUnidirectionalSequenceRnnOp.str()); return RegisterCustomBuiltinOps(extra_tf_opdefs); } -Status PopulateQuantizationSpecs( +absl::Status PopulateQuantizationSpecs( const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, mlir::quant::QuantizationSpecs* quant_specs, - std::vector* node_names, std::vector* node_dtypes, + std::vector* node_names, std::vector* node_dtypes, std::vector>>* node_shapes, std::vector>* node_mins, std::vector>* node_maxs) { quant_specs->inference_input_type = ConvertIODataTypeToDataType(toco_flags.inference_input_type()); - tensorflow::DataType inference_type = + DataType inference_type = ConvertIODataTypeToDataType(toco_flags.inference_type()); // Use 
non-float flag `inference_input_type` to override the `inference_type` // because we have to apply quantization to satisfy that. - if (quant_specs->inference_input_type != tensorflow::DT_FLOAT) { + if (quant_specs->inference_input_type != DT_FLOAT) { inference_type = quant_specs->inference_input_type; } @@ -270,11 +279,11 @@ Status PopulateQuantizationSpecs( quant_specs->disable_per_channel = toco_flags.disable_per_channel_quantization(); if (toco_flags.quantize_to_float16()) { - quant_specs->inference_type = tensorflow::DT_HALF; - quant_specs->inference_input_type = tensorflow::DT_HALF; + quant_specs->inference_type = DT_HALF; + quant_specs->inference_input_type = DT_HALF; } else { - quant_specs->inference_type = tensorflow::DT_QINT8; - quant_specs->inference_input_type = tensorflow::DT_QINT8; + quant_specs->inference_type = DT_QINT8; + quant_specs->inference_input_type = DT_QINT8; } } else { // These flags are incompatible with post_training_quantize() as only @@ -313,11 +322,14 @@ Status PopulateQuantizationSpecs( toco_flags.enable_mlir_dynamic_range_quantizer(); quant_specs->enable_mlir_variable_quantization = toco_flags.enable_mlir_variable_quantization(); - return OkStatus(); + quant_specs->disable_per_channel_for_dense_layers = + toco_flags.disable_per_channel_quantization_for_dense_layers(); + return absl::OkStatus(); } // Dumps the op graph of the `module` to `filename` in DOT format. 
-Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { +absl::Status DumpOpGraphToFile(mlir::ModuleOp module, + const std::string& filename) { std::string error_message; auto output = mlir::openOutputFile(filename, &error_message); if (!error_message.empty()) { @@ -329,15 +341,16 @@ Status DumpOpGraphToFile(mlir::ModuleOp module, const std::string& filename) { return errors::Unknown("Failed to dump Op Graph from MLIR module."); } output->keep(); - return OkStatus(); + return absl::OkStatus(); } -Status ConvertMLIRToTFLiteFlatBuffer( +absl::Status ConvertMLIRToTFLiteFlatBuffer( const toco::ModelFlags& model_flags, const toco::TocoFlags& toco_flags, mlir::OwningOpRef module, const mlir::TFL::PassConfig& pass_config, - const std::unordered_set& saved_model_tags, string* result, - std::optional session) { + const std::unordered_set& saved_model_tags, + std::string* result, SavedModelBundle* saved_model_bundle, + const PyFunctionLibrary* quantization_py_function_lib) { if (toco_flags.has_dump_graphviz_dir()) { TF_RETURN_IF_ERROR(DumpOpGraphToFile( module.get(), @@ -361,7 +374,8 @@ Status ConvertMLIRToTFLiteFlatBuffer( auto status = ConvertTFExecutorToTFLOrFlatbuffer( module.get(), /*export_to_mlir=*/false, toco_flags, pass_config_copy, - saved_model_tags, model_flags.saved_model_dir(), session, result); + saved_model_tags, model_flags.saved_model_dir(), saved_model_bundle, + result, /*serialize_stablehlo_ops=*/false, quantization_py_function_lib); if (toco_flags.has_dump_graphviz_dir()) { TF_RETURN_IF_ERROR(DumpOpGraphToFile( // rename once we enable the new converter feature flag. 
diff --git a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h index 8d7eeb2912a7b6..039e56672ddadc 100644 --- a/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h +++ b/tensorflow/compiler/mlir/lite/python/tf_tfl_flatbuffer_helpers.h @@ -24,8 +24,10 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/core/public/session.h" #include "tensorflow/lite/toco/model_flags.pb.h" #include "tensorflow/lite/toco/toco_flags.pb.h" @@ -55,7 +57,8 @@ Status ConvertMLIRToTFLiteFlatBuffer( mlir::OwningOpRef module, const mlir::TFL::PassConfig& pass_config, const std::unordered_set& saved_model_tags, string* result, - std::optional session); + SavedModelBundle* saved_model_bundle, + const quantization::PyFunctionLibrary* quantization_py_function_lib); // Give a warning for any unused flags that have been specified. 
void WarningUnusedFlags(const toco::ModelFlags& model_flags, diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD index 356801e7fd38a0..cf437c27e2ec4e 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/BUILD +++ b/tensorflow/compiler/mlir/lite/quantization/lite/BUILD @@ -2,7 +2,7 @@ load("//tensorflow/core/platform:rules_cc.bzl", "cc_library") load("//tensorflow:tensorflow.bzl", "tf_cc_binary", "tf_cc_test") package( - # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + # copybara:uncomment default_applicable_licenses = ["//tensorflow:LICENSE"], default_visibility = [ ":friends", "//tensorflow:__pkg__", diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc index 4545aa412686b7..f7ffc7d71d02f9 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.cc @@ -58,7 +58,8 @@ TfLiteStatus QuantizeModel( bool whole_model_verify, bool legacy_float_scale, const absl::flat_hash_set& denylisted_ops, const absl::flat_hash_set& denylisted_nodes, - const bool enable_variable_quantization) { + const bool enable_variable_quantization, + bool disable_per_channel_for_dense_layers) { // Translate TFLite names to mlir op names. 
absl::flat_hash_set denylisted_mlir_op_names; for (const auto& entry : denylisted_ops) { @@ -84,6 +85,8 @@ TfLiteStatus QuantizeModel( quant_specs.inference_type = tflite::TflTypeToTfType(inference_type); quant_specs.post_training_quantization = true; quant_specs.disable_per_channel = disable_per_channel; + quant_specs.disable_per_channel_for_dense_layers = + disable_per_channel_for_dense_layers; quant_specs.verify_numeric = verify_numeric; quant_specs.whole_model_verify = whole_model_verify; quant_specs.legacy_float_scale = legacy_float_scale; diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h index d85aba47811675..50b397ba0206d2 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model.h @@ -54,7 +54,8 @@ TfLiteStatus QuantizeModel( bool whole_model_verify = false, bool legacy_float_scale = true, const absl::flat_hash_set& denylisted_ops = {}, const absl::flat_hash_set& denylisted_nodes = {}, - bool enable_variable_quantization = false); + bool enable_variable_quantization = false, + bool disable_per_channel_for_dense_layers = false); } // namespace lite } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc index 5898c9e54234a5..696a2545d7097a 100644 --- a/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc +++ b/tensorflow/compiler/mlir/lite/quantization/lite/quantize_model_test.cc @@ -72,7 +72,8 @@ TfLiteStatus QuantizeModel( const TensorType& activations_type, ErrorReporter* error_reporter, std::string& output_buffer, const bool disable_per_channel = false, const absl::flat_hash_set& blocked_ops = {}, - const absl::flat_hash_set& blocked_nodes = {}) { + const absl::flat_hash_set& blocked_nodes = {}, + const bool 
disable_per_channel_for_dense_layers = false) { TensorType inference_tensor_type = activations_type; const bool fully_quantize = !allow_float; @@ -87,7 +88,10 @@ TfLiteStatus QuantizeModel( input_buffer, input_type, output_type, inference_tensor_type, /*operator_names=*/{}, disable_per_channel, fully_quantize, output_buffer, error_reporter, /*verify_numeric=*/false, /*whole_model_verify=*/false, - /*legacy_float_scale=*/true, blocked_ops, blocked_nodes); + /*legacy_float_scale=*/true, blocked_ops, blocked_nodes, + /*enable_variable_quantization=*/false, + /*disable_per_channel_for_dense_layers=*/ + disable_per_channel_for_dense_layers); if (status != kTfLiteOk) { return status; } @@ -140,6 +144,21 @@ TfLiteStatus QuantizeModelAllOperators( output_buffer); } +TfLiteStatus QuantizeModelAllOperators( + ModelT* model, const TensorType& input_type, const TensorType& output_type, + bool allow_float, const TensorType& activations_type, + ErrorReporter* error_reporter, std::string& output_buffer, + bool disable_per_channel_for_dense_layers) { + return QuantizeModel(model, input_type, output_type, allow_float, + /*operator_names=*/{}, activations_type, error_reporter, + output_buffer, + /*disable_per_channel=*/false, + /* blocked_ops=*/{}, + /*blocked_nodes=*/{}, + /*disable_per_channel_for_dense_layers=*/ + disable_per_channel_for_dense_layers); +} + std::unique_ptr ReadModel(const string& model_name) { auto model_path = tensorflow::io::JoinPath(*g_test_model_dir, model_name); return FlatBufferModel::BuildFromFile(model_path.c_str()); @@ -1118,16 +1137,20 @@ TEST_F(QuantizeSVDFTest, VerifySVDF) { ExpectSameModels(model_, expected_model); } -class QuantizeFCTest : public QuantizeModelTest { +class QuantizeFCTest : public QuantizeModelTest, + public testing::WithParamInterface { protected: QuantizeFCTest() { + disable_per_channel_quantization_for_dense_ = GetParam(); input_model_ = ReadModel(internal::kModelWithFCOp); readonly_model_ = input_model_->GetModel(); model_ = 
UnPackFlatBufferModel(*readonly_model_); } + + bool disable_per_channel_quantization_for_dense_; }; -TEST_F(QuantizeFCTest, VerifyFC8x8) { +TEST_P(QuantizeFCTest, VerifyFC8x8) { auto status = QuantizeModelAllOperators( &model_, TensorType_INT8, TensorType_INT8, /*allow_float=*/false, TensorType_INT8, &error_reporter_, output_buffer_); @@ -1180,7 +1203,7 @@ TEST_F(QuantizeFCTest, VerifyFC8x8) { /*bit_num=*/8, /*symmetric=*/false); } -TEST_F(QuantizeFCTest, VerifyFCFor16x8) { +TEST_P(QuantizeFCTest, VerifyFCFor16x8) { auto status = QuantizeModelAllOperators( &model_, TensorType_INT8, TensorType_INT8, /*allow_float=*/false, TensorType_INT16, &error_reporter_, output_buffer_); @@ -1195,7 +1218,7 @@ TEST_F(QuantizeFCTest, VerifyFCFor16x8) { ASSERT_THAT(op->outputs, SizeIs(1)); const SubGraph* float_graph = readonly_model_->subgraphs()->Get(0); - // Verify FC input tesnor and weight are int16 and int8 quantized. + // Verify FC input tensor and weight are int16 and int8 quantized. const Operator* float_op = float_graph->operators()->Get(0); ASSERT_THAT(float_graph->tensors()->Get(float_op->inputs()->Get(0))->type(), Eq(TensorType_FLOAT32)); @@ -1235,6 +1258,136 @@ TEST_F(QuantizeFCTest, VerifyFCFor16x8) { /*bit_num=*/16, /*symmetric=*/true); } +TEST_P(QuantizeFCTest, VerifyDisablePerChannelQuantization) { + auto status = QuantizeModelAllOperators( + &model_, TensorType_INT8, TensorType_INT8, /*allow_float=*/false, + TensorType_INT8, &error_reporter_, output_buffer_, + /*disable_per_channel_for_dense_layers=*/ + disable_per_channel_quantization_for_dense_); + ASSERT_THAT(status, Eq(kTfLiteOk)); + const auto& subgraph = model_.subgraphs[0]; + auto fc_op = subgraph->operators[0].get(); + + ASSERT_THAT(fc_op->inputs, SizeIs(3)); + ASSERT_THAT(fc_op->outputs, SizeIs(1)); + + const int input_tensor_idx = 0; + const int weights_tensor_idx = 1; + const int bias_tensor_index = 2; + const int output_tensor_idx = 0; + const auto bias_tensor = + 
subgraph->tensors[fc_op->inputs[bias_tensor_index]].get(); + const auto input_tensor = + subgraph->tensors[fc_op->inputs[input_tensor_idx]].get(); + const auto weights_tensor = + subgraph->tensors[fc_op->inputs[weights_tensor_idx]].get(); + const auto output_tensor = + subgraph->tensors[fc_op->outputs[output_tensor_idx]].get(); + + EXPECT_THAT(bias_tensor->type, Eq(TensorType_INT32)); + EXPECT_THAT(input_tensor->type, Eq(TensorType_INT8)); + EXPECT_THAT(weights_tensor->type, Eq(TensorType_INT8)); + EXPECT_THAT(output_tensor->type, Eq(TensorType_INT8)); + + ASSERT_TRUE(weights_tensor->quantization); + ASSERT_TRUE(bias_tensor->quantization); + ASSERT_TRUE(weights_tensor->quantization); + const std::vector& bias_scales = bias_tensor->quantization->scale; + const std::vector& weights_scales = + weights_tensor->quantization->scale; + const std::vector& weights_zero_points = + weights_tensor->quantization->zero_point; + + const int out_channel_size = 2; + ASSERT_THAT(bias_scales, SizeIs(disable_per_channel_quantization_for_dense_ + ? 1 + : out_channel_size)); + ASSERT_THAT(weights_scales, SizeIs(disable_per_channel_quantization_for_dense_ + ? 1 + : out_channel_size)); + ASSERT_THAT( + weights_zero_points, + SizeIs(disable_per_channel_quantization_for_dense_ ? 1 + : out_channel_size)); + ASSERT_THAT(input_tensor->quantization->scale, SizeIs(1)); + ASSERT_THAT(output_tensor->quantization->scale, SizeIs(1)); + + const float eps = 1e-7; + + // Bias scale should be input * per_channel_weight_scale. + for (size_t i = 0; i < out_channel_size; i++) { + EXPECT_THAT((disable_per_channel_quantization_for_dense_ ? bias_scales[0] + : bias_scales[i]), + FloatNear(input_tensor->quantization->scale[0] * + (disable_per_channel_quantization_for_dense_ + ? 
weights_scales[0] + : weights_scales[i]), + eps)); + } + + const auto bias_buffer = model_.buffers[bias_tensor->buffer].get(); + auto control_size = sizeof(int32_t) * bias_tensor->shape[0]; + + ASSERT_THAT(bias_buffer->data, SizeIs(control_size)); + const auto float_op = + readonly_model_->subgraphs()->Get(0)->operators()->Get(0); + const auto original_bias_tensor = + readonly_model_->subgraphs()->Get(0)->tensors()->Get( + float_op->inputs()->Get(2)); + ASSERT_THAT(bias_buffer->data, SizeIs(control_size)); + const auto original_bias_buffer = + readonly_model_->buffers()->Get(original_bias_tensor->buffer()); + const float* bias_float_buffer = + reinterpret_cast(original_bias_buffer->data()->data()); + + int32_t* bias_values = reinterpret_cast(bias_buffer->data.data()); + for (size_t i = 0; i < out_channel_size; i++) { + const float bias_scale = disable_per_channel_quantization_for_dense_ + ? bias_scales[0] + : bias_scales[i]; + auto dequantized_value = bias_values[i] * bias_scale; + EXPECT_THAT(dequantized_value, + FloatNear(bias_float_buffer[i], bias_scale / 2)); + } + + const auto weights_buffer = model_.buffers[weights_tensor->buffer].get(); + const auto original_weights_tensor = + readonly_model_->subgraphs()->Get(0)->tensors()->Get( + float_op->inputs()->Get(1)); + const auto original_weights_buffer = + readonly_model_->buffers()->Get(original_weights_tensor->buffer()); + const int8_t* weight_values = + reinterpret_cast(weights_buffer->data.data()); + const float* weights_float_buffer = + reinterpret_cast(original_weights_buffer->data()->data()); + ASSERT_THAT(sizeof(float) * weights_buffer->data.size(), + Eq(original_weights_buffer->data()->size())); + int num_values_in_channel = weights_buffer->data.size() / out_channel_size; + for (size_t channel_idx = 0; channel_idx < out_channel_size; channel_idx++) { + for (size_t j = 0; j < num_values_in_channel; j++) { + size_t element_idx = channel_idx * num_values_in_channel + j; + auto scale = 
disable_per_channel_quantization_for_dense_ + ? weights_scales[0] + : weights_scales[channel_idx]; + auto zero_point = disable_per_channel_quantization_for_dense_ + ? weights_zero_points[0] + : weights_zero_points[channel_idx]; + auto dequantized_value = weight_values[element_idx] * scale; + EXPECT_THAT(dequantized_value, + FloatNear(weights_float_buffer[element_idx], scale / 2)); + EXPECT_THAT(zero_point, Eq(0)); + } + } + + // check op and versioning. + EXPECT_THAT(model_.operator_codes, SizeIs(1)); + EXPECT_THAT(GetBuiltinCode(model_.operator_codes[0].get()), + Eq(BuiltinOperator_FULLY_CONNECTED)); + ASSERT_THAT(model_.operator_codes[0]->version, 5); +} + +INSTANTIATE_TEST_SUITE_P(QuantizeFCTestInst, QuantizeFCTest, testing::Bool()); + class QuantizeCustomOpTest : public QuantizeModelTest, public ::testing::WithParamInterface { diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h index fcf36de6247f02..66175aabf394e4 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_config.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_config.h @@ -45,6 +45,7 @@ struct CustomOpInfo { using ::tflite::optimize::ReducedPrecisionSupport; using CustomOpMap = std::unordered_map; enum CustomOpUpdateOptions { kInputIndices, kWeightOnly, kNoSideEffect }; +enum class QDQConversionMode { kQDQNone, kQDQStatic, kQDQDynamic }; struct QuantizationSpecs { // Which function this node quant specifications belong to. @@ -85,6 +86,11 @@ struct QuantizationSpecs { // weight FakeQuant). bool disable_per_channel = false; + // Disables per channel weights quantization for Dense layers and enables + // legacy per tensor quantization. The legacy quantization for Dense layers is + // inconsistent with Conv 1x1 which always performs per channel quantization. 
+ bool disable_per_channel_for_dense_layers = false; + // When set to true, the fixed output ranges of the activation ops (tanh, // sigmoid, etc.) and the weight constants are not inferred. Then, to quantize // these ops, quantization emulation ops should be placed after the ops in the @@ -209,9 +215,11 @@ struct QuantizationSpecs { // For dynamic range quantization, among the custom ops in the graph those // specified in this map are subject to quantization. CustomOpMap custom_map; -}; -enum class QDQConversionMode { kQDQNone, kQDQStatic, kQDQDynamic }; + // If other than kQDQNone, the model is a floating point graph with QDQ ops + // to be eliminated and fused into quantized kernels. + QDQConversionMode qdq_conversion_mode = QDQConversionMode::kQDQNone; +}; // Parses the command line flag strings to the CustomOpMap specification. void ParseCustomOpSpecs(absl::string_view node_names, diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc index 62c2733d2b510c..408540cd84a146 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_driver.cc @@ -17,6 +17,7 @@ limitations under the License. #include #include #include +#include #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/STLExtras.h" @@ -32,6 +33,7 @@ limitations under the License. 
#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project @@ -109,7 +111,8 @@ class QuantizationDriver { bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, OpQuantScaleSpecGetter op_quant_scale_spec_getter, - bool infer_tensor_range, bool legacy_float_scale) + bool infer_tensor_range, bool legacy_float_scale, + bool is_qdq_conversion) : fn_(fn), builder_(fn.getBody()), is_signed_(is_signed), @@ -118,7 +121,8 @@ class QuantizationDriver { op_quant_spec_getter_(op_quant_spec_getter), op_quant_scale_spec_getter_(op_quant_scale_spec_getter), infer_tensor_range_(infer_tensor_range), - legacy_float_scale_(legacy_float_scale) {} + legacy_float_scale_(legacy_float_scale), + is_qdq_conversion_(is_qdq_conversion) {} // The entry point of the quantization parameters propagation. void Run(); @@ -198,7 +202,7 @@ class QuantizationDriver { // Returns the quantization params for the bias input from the non-bias // operands which have their indexes in the `non_biases` vector. The returned // parameters are calculated by `func`. - QuantParams GetBiasParams(Operation *op, int bias, + QuantParams GetBiasParams(Operation *op, int bias_index, const std::vector &non_biases, AccumulatorScaleFunc func); @@ -429,6 +433,10 @@ class QuantizationDriver { // Calculate scales in float instead of double, so that the scales and // quantized values are exactly the same with the TOCO quantizer. bool legacy_float_scale_; + + // If true, the model is a floating point graph with QDQ ops to be eliminated + // and fused into quantized kernels. 
+ bool is_qdq_conversion_; }; } // namespace @@ -518,20 +526,35 @@ bool QuantizationDriver::SetResultParams(Operation *op, int res_index, } QuantParams QuantizationDriver::GetBiasParams( - Operation *op, int bias, const std::vector &non_biases, + Operation *op, const int bias_index, const std::vector &non_biases, AccumulatorScaleFunc func) { - auto &bias_state = GetOperandQuantState(op, bias); + QuantState &bias_state = GetOperandQuantState(op, bias_index); if (!bias_state.IsEmpty()) { return bias_state.params; } std::vector op_types; op_types.reserve(non_biases.size()); + int adjusted_quant_dim = -1; + if (op->getNumOperands() > bias_index) { + // Some kernels allow 1D bias, broadcasting it inside the kernel. In this + // case, the `quantizedDimension=0` when quantizing per-channel. + // However, for some kernels which require bias to be already broadcasted + // to match the accumulation shape, the very last index should be used. + Operation *bias_op = op->getOperand(bias_index).getDefiningOp(); + if (bias_op != nullptr) { + Type bias_type = bias_op->getResult(0).getType(); + if (bias_type != builder_.getNoneType()) { + int bias_rank = bias_type.dyn_cast().getRank(); + adjusted_quant_dim = bias_rank > 1 ? bias_rank - 1 : 0; + } + } + } + for (auto non_bias : non_biases) { auto &non_bias_type = GetOperandQuantState(op, non_bias); op_types.push_back(non_bias_type.params); } - if (op_types.empty()) return {}; - return func(op_types, legacy_float_scale_); + return func(op_types, adjusted_quant_dim, legacy_float_scale_); } bool QuantizationDriver::SetOperandParams(Operation *op, int index, @@ -956,7 +979,10 @@ bool QuantizationDriver::PropagateParams() { } } - if (scale_spec->has_fixed_output_range && infer_tensor_range_) { + // If the model already contains immutable QDQs, require upstream to + // explicitly fix output range instead. + if (scale_spec->has_fixed_output_range && infer_tensor_range_ && + !is_qdq_conversion_) { // Infer ranges from the activation ops. 
This is usually required for // the post-training quantization workflow. // TODO(fengliuai): different result can have different fixed range. @@ -1182,20 +1208,22 @@ void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, bool infer_tensor_ranges, - bool legacy_float_scale) { + bool legacy_float_scale, + bool is_qdq_conversion) { ApplyQuantizationParamsPropagation( func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, - GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale); + GetDefaultQuantScaleSpec, infer_tensor_ranges, legacy_float_scale, + is_qdq_conversion); } void ApplyQuantizationParamsPropagation( mlir::func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, - bool legacy_float_scale) { + bool legacy_float_scale, bool is_qdq_conversion) { QuantizationDriver(func, is_signed, bit_width, disable_per_channel, op_quant_spec_getter, op_quant_scale_spec_getter, - infer_tensor_ranges, legacy_float_scale) + infer_tensor_ranges, legacy_float_scale, is_qdq_conversion) .Run(); } diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc index 9a151a80e8f48b..53f8024c7900dd 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include #include @@ -29,13 +30,20 @@ limitations under the License. 
#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h" @@ -43,7 +51,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantizeUtils.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/UniformSupport.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_traits.h" -#include "tensorflow/lite/kernels/internal/tensor_utils.h" +#include "tensorflow/lite/kernels/internal/portable_tensor_utils.h" #include "tensorflow/lite/tools/optimize/quantization_utils.h" namespace mlir { @@ -469,7 +477,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight(ElementsAttr attr, int quant_dim, quant::QuantizedType GetUniformQuantizedTypeForBias( const std::vector& op_types, - bool legacy_float_scale) { + const int adjusted_quant_dim, const bool legacy_float_scale) { if (op_types.empty()) return {}; size_t axis_size = 1; @@ -531,13 +539,14 @@ quant::QuantizedType GetUniformQuantizedTypeForBias( /*zeroPoint=*/0, storage_type_min, storage_type_max); } else { llvm::SmallVector zero_points(axis_size, 0); - // Assume the bias is a 1-D tensor, and set the quantization dim to the last - // dimension, which is 0. If the bias rank is larger than 1, this returned - // quantized type couldn't be used to quantize the bias. + // If the bias is a 1-D tensor, set the `quantizedDimension` to 0. + // If the bias rank is larger than 1 because it was already broadcasted + // to match the output shape, use the last index. return quant::UniformQuantizedPerAxisType::getChecked( builder.getUnknownLoc(), /*flags=*/true, storage_type, expressed_type, scales, zero_points, - /*quantizedDimension=*/0, storage_type_min, storage_type_max); + /*quantizedDimension=*/std::max(adjusted_quant_dim, 0), + storage_type_min, storage_type_max); } } @@ -598,7 +607,7 @@ ElementsAttr QuantizeLegacy(Attribute real_value, Type tensor_type) { return DenseElementsAttr::get(new_dense_type, quantized_attr); } else if (width == 8) { // This can be a state tensor, or an actual constant tensor with - // asymmetric range. 
For a state tensor, assigining correct quantization + // asymmetric range. For a state tensor, assigning correct quantization // parameters is sufficient, and for constants with asymmetric range it's // not correctly quantized by legacy quantizer so call the new Quantize. return Quantize(real_value, tensor_type); @@ -643,7 +652,7 @@ ElementsAttr Quantize(Attribute real_value, Type tensor_type) { quant::QuantizedType::getQuantizedElementType(tensor_type)) { Type converted_type; return quantfork::quantizeAttr(real_value, q_type, converted_type) - .dyn_cast(); + .dyn_cast_or_null(); } return {}; } @@ -816,7 +825,7 @@ bool RemoveRedundantStatsOps( } } - // Step 2: backward pass: For the ops skiped in the forward pass, propagate + // Step 2: backward pass: For the ops skipped in the forward pass, propagate // its results scale backwards as far as possible. func.walk([&](quantfork::StatisticsOp stats_op) { if (redundant_stats_ops.find(stats_op) == redundant_stats_ops.end()) { diff --git a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h index 1113bb868fa3e8..e1b697e3be67d1 100644 --- a/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h +++ b/tensorflow/compiler/mlir/lite/quantization/quantization_utils.h @@ -66,29 +66,29 @@ namespace quant { // A unit attribute can be attached to the quantize/dequantize ops which are // added by the quantization passes. These ops can be removed erased without // losing accuracy. -constexpr char kVolatileOpAttrName[] = "volatile"; +inline constexpr char kVolatileOpAttrName[] = "volatile"; // Following attributes are used to mark ops that are not quantizable during // debug model generation process for whole-model verify mode. If these // attributes are attached, the upstream float/quantized ops know which ops to // connect to, and it also prevents these ops from being copied again. 
-constexpr char kDebugModeOpFloatAttrName[] = "debug_float"; -constexpr char kDebugModeOpQuantAttrName[] = "debug_quant"; +inline constexpr char kDebugModeOpFloatAttrName[] = "debug_float"; +inline constexpr char kDebugModeOpQuantAttrName[] = "debug_quant"; // Used to annotate custom ops if they are quantizable. -constexpr char kQuantTraitAttrName[] = "_tfl_quant_trait"; +inline constexpr char kQuantTraitAttrName[] = "_tfl_quant_trait"; enum QuantizationTrait { FullyQuantizable = 0, NotQuantizable = 1 }; -constexpr absl::string_view QuantTraitValues[] = {"fully_quantizable", - "not_quantizable"}; +inline constexpr absl::string_view QuantTraitValues[] = {"fully_quantizable", + "not_quantizable"}; -constexpr double kNearZeroTolerance = 1.0e-6; +inline constexpr double kNearZeroTolerance = 1.0e-6; using QuantParams = QuantizedType; using QuantSpec = QuantizationSpecs; using SignedInteger = std::pair; // bitwidth and sign using QuantParamsForResults = llvm::SmallVector; using AccumulatorScaleFunc = - std::function&, bool)>; + std::function&, int, bool)>; using BiasParamsMap = std::unordered_map, AccumulatorScaleFunc>>; // UniformQuantizedType GetFixedOutputRange(bool sign, int bit_width) @@ -890,7 +890,7 @@ Type GetUniformQuantizedPerAxisTypeForWeight( // other operands which are multiply-accumulated (the bias is added to the // accumulated value). 
quant::QuantizedType GetUniformQuantizedTypeForBias( - const std::vector& op_types, + const std::vector& op_types, int adjusted_quant_dim, bool legacy_float_scale = false); // Propagates quantization parameters across ops in this function and satisfy @@ -906,13 +906,14 @@ void ApplyQuantizationParamsPropagation(mlir::func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, bool infer_tensor_ranges, - bool legacy_float_scale = false); + bool legacy_float_scale = false, + bool is_qdq_conversion = false); void ApplyQuantizationParamsPropagation( mlir::func::FuncOp func, bool is_signed, int bit_width, bool disable_per_channel, OpQuantSpecGetter op_quant_spec_getter, OpQuantScaleSpecGetter op_quant_scale_spec_getter, bool infer_tensor_ranges, - bool legacy_float_scale = false); + bool legacy_float_scale = false, bool is_qdq_conversion = false); // Gets quantization scale specs (e.g. fixed output range, same result and // operand scales) from the default quantization interfaces. 
The op should diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD new file mode 100644 index 00000000000000..7f6b74431a95b7 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/BUILD @@ -0,0 +1,48 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") +load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") + +package( + # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], + default_visibility = ["//tensorflow/compiler/mlir/lite:__subpackages__"], + licenses = ["notice"], +) + +cc_library( + name = "quantization", + srcs = ["quantization.cc"], + hdrs = ["quantization.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/cc/saved_model:constants", + "//tensorflow/cc/saved_model:loader", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq", + "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_saved_model_freeze_variables", + "//tensorflow/core/protobuf:for_core_protos_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "quantization_test", + srcs = ["quantization_test.cc"], + deps = [ + ":quantization", + "//tensorflow/cc/saved_model:loader", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:io", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq_impl", # buildcleaner: keep; prevents undefined reference + 
"//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", # buildcleaner: keep; prevents undefined reference + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@local_tsl//tsl/platform:status_matchers", + ], +) diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc new file mode 100644 index 00000000000000..929634164fba3b --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.cc @@ -0,0 +1,118 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h" + +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/cc/saved_model/constants.h" +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" +#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" + +namespace tensorflow { +namespace { + +using ::mlir::quant::stablehlo::StaticRangePtqComponent; +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::SignatureDef; +using ::tensorflow::quantization::PyFunctionLibrary; + +// Returns signature key -> `SignatureDef` mapping, excluding the signature for +// initialization op, which is only used during initialization. +// TODO: b/314124142 - Remove the need for this function. +absl::flat_hash_map GetSignatureDefMapFromBundle( + const SavedModelBundle& saved_model_bundle) { + // Translate protobuf::Map -> absl::flat_hash_map. + const protobuf::Map& signatures = + saved_model_bundle.GetSignatures(); + absl::flat_hash_map signature_def_map( + signatures.begin(), signatures.end()); + + // Init op is only used during initialization and it's not a target for + // quantization. + signature_def_map.erase(kSavedModelInitOpSignatureKey); + return signature_def_map; +} + +// Retrieves the function name -> function alias mapping from the +// `SavedModelBundle`. 
+// TODO: b/314124142 - Remove the need for this function. +absl::flat_hash_map GetFunctionAliases( + const SavedModelBundle& saved_model_bundle) { + const protobuf::Map& function_aliases = + saved_model_bundle.meta_graph_def.meta_info_def().function_aliases(); + return absl::flat_hash_map(function_aliases.begin(), + function_aliases.end()); +} + +} // namespace + +absl::StatusOr RunQuantization( + const SavedModelBundle* saved_model_bundle, + const absl::string_view saved_model_dir, + const std::unordered_set& saved_model_tags, + const QuantizationConfig& quantization_config, + const PyFunctionLibrary* quantization_py_function_lib, + mlir::ModuleOp module_op) { + if (saved_model_bundle == nullptr) { + return absl::InvalidArgumentError( + "Failed to run quantization. `saved_model_bundle` should not be " + "nullptr."); + } + + if (quantization_py_function_lib == nullptr) { + return absl::InvalidArgumentError( + "Failed to run quantization. `quantization_py_function_lib` should not " + "be nullptr."); + } + + const absl::flat_hash_map signature_def_map = + GetSignatureDefMapFromBundle(*saved_model_bundle); + + std::vector exported_names; + for (const auto& [key, value_unused] : signature_def_map) { + exported_names.push_back(key); + } + + if (failed(mlir::tf_saved_model::FreezeVariables( + module_op, saved_model_bundle->GetSession()))) { + return absl::InternalError("Failed to freeze variables."); + } + + StaticRangePtqComponent static_range_ptq_component( + module_op.getContext(), quantization_py_function_lib, saved_model_dir, + /*signature_keys=*/exported_names, saved_model_tags, signature_def_map, + GetFunctionAliases(*saved_model_bundle)); + const absl::StatusOr quantized_module_op = + static_range_ptq_component.Run(module_op, quantization_config); + if (!quantized_module_op.ok()) { + return absl::InternalError("Failed to run quantization. 
Status msg: " + + quantized_module_op.status().ToString()); + } + return quantized_module_op; +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h new file mode 100644 index 00000000000000..c55d59cad0f1a0 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h @@ -0,0 +1,60 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +// Adaptor functions for StableHLO Quantizer. +// Provides simpler interfaces when integrating StableHLO Quantizer into TFLite +// Converter. + +#ifndef TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ +#define TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ + +#include +#include + +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" + +namespace tensorflow { + +// Runs quantization on `module_op`. `saved_model_bundle` is required to +// retrieve information about the original model (e.g. 
signature def mapping) +// because quantization requires exporting the intermediate `ModuleOp` back to +// SavedModel for calibration. Similarly, `saved_model_dir` is required to +// access the assets of the original model. `saved_model_tags` uniquely +// identifies the `MetaGraphDef`. `quantization_config` determines the behavior +// of StableHLO Quantizer. `quantization_py_function_lib` contains python +// implementations of certain APIs that are required for calibration. +// `module_op` is the input graph to be quantized and it should contain +// StableHLO ops. +// +// Returns a quantized `ModuleOp` in StableHLO, potentially wrapped inside a +// XlaCallModuleOp. Returns a non-OK status if quantization fails, or any of +// `saved_model_bundle` or `quantization_py_function_lib` is a nullptr. +absl::StatusOr RunQuantization( + const SavedModelBundle* saved_model_bundle, + absl::string_view saved_model_dir, + const std::unordered_set& saved_model_tags, + const stablehlo::quantization::QuantizationConfig& quantization_config, + const tensorflow::quantization::PyFunctionLibrary* + quantization_py_function_lib, + mlir::ModuleOp module_op); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_LITE_QUANTIZATION_STABLEHLO_QUANTIZATION_H_ diff --git a/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization_test.cc b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization_test.cc new file mode 100644 index 00000000000000..3cbc9e6ea47864 --- /dev/null +++ b/tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization_test.cc @@ -0,0 +1,79 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +// Test cases for the StableHLO Quantizer adaptor functions. + +#include "tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h" + +#include + +#include +#include +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tsl/platform/status_matchers.h" + +namespace tensorflow { +namespace { + +using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::io::CreateTmpDir; +using ::testing::HasSubstr; +using ::tsl::testing::IsOk; +using ::tsl::testing::StatusIs; + +// Test cases for `RunQuantization` mainly tests for error cases because testing +// for successful cases require passing python implementation to +// `quantization_py_function_lib`, which requires testing from the python level. +// Internal integration tests exist for testing successful quantization. 
+ +TEST(RunQuantizationTest, + WhenSavedModelBundleIsNullptrReturnsInvalidArgumentError) { + const absl::StatusOr tmp_saved_model_dir = CreateTmpDir(); + ASSERT_THAT(tmp_saved_model_dir, IsOk()); + + const absl::StatusOr quantized_module_op = RunQuantization( + /*saved_model_bundle=*/nullptr, *tmp_saved_model_dir, + /*saved_model_tags=*/{}, QuantizationConfig(), + /*quantization_py_function_lib=*/nullptr, /*module_op=*/{}); + EXPECT_THAT( + quantized_module_op, + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("`saved_model_bundle` should not be nullptr"))); +} + +TEST(RunQuantizationTest, + WhenPyFunctionLibIsNullptrReturnsInvalidArgumentError) { + const absl::StatusOr tmp_saved_model_dir = CreateTmpDir(); + ASSERT_THAT(tmp_saved_model_dir, IsOk()); + + // Dummy SavedModelBundle to pass a non-nullptr argument. + SavedModelBundle bundle{}; + const absl::StatusOr quantized_module_op = RunQuantization( + /*saved_model_bundle=*/&bundle, *tmp_saved_model_dir, + /*saved_model_tags=*/{}, QuantizationConfig(), + /*quantization_py_function_lib=*/nullptr, /*module_op=*/{}); + EXPECT_THAT( + quantized_module_op, + StatusIs( + absl::StatusCode::kInvalidArgument, + HasSubstr("`quantization_py_function_lib` should not be nullptr"))); +} + +} // namespace +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc index ad4112a05ad4a9..47440b4c4c0beb 100644 --- a/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc +++ b/tensorflow/compiler/mlir/lite/quantization/tools/op_quant_spec_getters_gen.cc @@ -15,7 +15,10 @@ limitations under the License. 
#include +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/PrettyStackTrace.h" @@ -24,6 +27,7 @@ limitations under the License. #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" #include "mlir/TableGen/Operator.h" // from @llvm-project +#include "mlir/TableGen/Trait.h" // from @llvm-project using llvm::LessRecord; using llvm::raw_ostream; @@ -50,7 +54,8 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { llvm::sort(defs, LessRecord()); OUT(0) << "static std::unique_ptr " - "GetOpQuantSpec(mlir::Operation *op) {\n"; + "GetOpQuantSpec(mlir::Operation *op, bool " + "disable_per_channel_for_dense_layers = false) {\n"; // TODO(b/176258587): Move to OpTrait if this should be generalized. // Add special handling for LSTM. OUT(2) << "if (auto lstm_op = llvm::dyn_cast(op)) {\n"; @@ -94,7 +99,9 @@ static bool OpQuantSpecWriter(raw_ostream &os, RecordKeeper &records) { // There is a "QuantChannelDim" trait, set the quantization dimension. if (coeff_index_trait_regex.match(trait_str, &matches)) { OUT(4) << "spec->coeff_op_quant_dim[tfl.GetCoefficientOperandIndex()" - << "] = tfl.GetQuantizationDim();\n"; + << "] = llvm::dyn_cast(op) && " + "disable_per_channel_for_dense_layers ? 
-1 : " + "tfl.GetQuantizationDim();\n"; matches.clear(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/BUILD b/tensorflow/compiler/mlir/lite/stablehlo/BUILD index feb47bfe4420b0..89d6139ba7ba19 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/lite/stablehlo/BUILD @@ -280,7 +280,7 @@ cc_library( deps = [ ":passes_inc_gen", "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", - "//tensorflow/compiler/mlir/quantization/stablehlo:uniform_quantized_types", + "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", "@com_google_absl//absl/algorithm:container", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", @@ -407,12 +407,14 @@ cc_library( ], deps = [ "//tensorflow/compiler/mlir/tensorflow", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", "@stablehlo//:stablehlo_ops", "@stablehlo//:stablehlo_serialization", @@ -455,10 +457,12 @@ cc_library( deps = [ ":passes_inc_gen", "//tensorflow/compiler/mlir/lite:tensorflow_lite", - "//tensorflow/compiler/mlir/quantization/stablehlo:uniform_quantized_types", + "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/log:check", "@llvm-project//llvm:Support", + "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", @@ -599,8 +603,10 @@ tf_cc_binary( "//tensorflow/compiler/mlir/lite:flatbuffer_export", "//tensorflow/compiler/mlir/lite:tf_to_tfl_flatbuffer", "//tensorflow/compiler/mlir/lite/stablehlo/serializer:flatbuffer_export", + 
"//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq_impl", "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", "//tensorflow/compiler/mlir/quantization/tensorflow:tf_quant_ops", + "//tensorflow/compiler/mlir/quantization/tensorflow/calibrator:calibrator_singleton_impl", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_graph_optimization_pass", diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir index 795840247cab93..292802ec92e5e1 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/call_xla_module_to_stablehlo.mlir @@ -17,7 +17,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr } } -// CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.metadata = {keep_stablehlo_constant = "true"}, tfl.schema_version = 3 : i32} { +// CHECK: module attributes +// CHECK-SAME: tfl.metadata = {{{.*}}keep_stablehlo_constant = "true"{{.*}}} // CHECK-NEXT: func.func @main(%arg0: tensor<2x3xi32>) -> tensor<2x3xi32> attributes {tf.entry_function = {inputs = "args_tf_0", outputs = "Identity"}} { // CHECK-NEXT: %0 = stablehlo.custom_call @Sharding(%arg0) {mhlo.sharding = ""} : (tensor<2x3xi32>) -> tensor<2x3xi32> // CHECK-NEXT: %1 = stablehlo.multiply %0, %0 : tensor<2x3xi32> diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir index f50c399deb5579..d0da1f09fa5ae1 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tf-fb-tf.mlir @@ -8,10 +8,8 @@ 
func.func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> { } } -// CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { -// CHECK-NEXT: func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> attributes {tf.entry_function = {inputs = "arg0", outputs = "tfl.custom1"}} { +// CHECK: func @main(%arg0: tensor<2xi32>) -> tensor<2xi32> attributes {tf.entry_function = {inputs = "arg0", outputs = "tfl.custom1"}} { // CHECK-NEXT: %0 = "tfl.custom"(%arg0, %arg0) {custom_code = "stablehlo.add", custom_option = #tfl} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> // CHECK-NEXT: %1 = "tfl.custom"(%0, %arg0) {custom_code = "stablehlo.subtract", custom_option = #tfl} : (tensor<2xi32>, tensor<2xi32>) -> tensor<2xi32> // CHECK-NEXT: return %1 : tensor<2xi32> // CHECK-NEXT: } -// CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir index 3b5dae4706e877..85653de898aa01 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-broadcast_in_dim.mlir @@ -2,7 +2,7 @@ module { func.func @main(%arg0: tensor<1x2xi32>) -> tensor<1x2x2xi32> { - %0= "stablehlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor<1x2xi32>) -> tensor<1x2x2xi32> + %0= "stablehlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = array} : (tensor<1x2xi32>) -> tensor<1x2x2xi32> func.return %0 : tensor<1x2x2xi32> } } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir index 4030506472268c..aa7742c15e4c42 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir +++ 
b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-conv.mlir @@ -13,15 +13,15 @@ func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> output_batch_dimension = 3, output_feature_dimension = 0, output_spatial_dimensions = [1, 2] - >, feature_group_count = 1 : i64, lhs_dilation = dense<1> : tensor<2xi64>, padding = dense<1> : tensor<2x2xi64>, precision_config = [#stablehlo, #stablehlo], rhs_dilation = dense<1> : tensor<2xi64>, window_strides = dense<1> : tensor<2xi64>} : + >, feature_group_count = 1 : i64, lhs_dilation = array, padding = dense<1> : tensor<2x2xi64>, precision_config = [#stablehlo, #stablehlo], rhs_dilation = array, window_strides = array, window_reversal = array} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> func.return %0 : tensor<16x8x8x1xf32> } } -// CHECK: module { -// CHECK-NEXT: func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { -// CHECK-NEXT: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.convolution", custom_option = #tfl} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> -// CHECK-NEXT: return %0 : tensor<16x8x8x1xf32> -// CHECK-NEXT: } +// CHECK: module { +// CHECK-NEXT: func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { +// CHECK-NEXT: %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.convolution", custom_option = #tfl} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> +// CHECK-NEXT: return %0 : tensor<16x8x8x1xf32> +// CHECK-NEXT: } // CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir index 5fb78f0540c3c5..47c716c0ca5243 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir +++ 
b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-stablehlo-tfl-gather.mlir @@ -9,7 +9,7 @@ func.func @main(%arg0: tensor<1x128x256xf32>, %arg1: tensor<30x1x2xi32>) -> tens start_index_map = [0, 1], index_vector_dim = 2>, indices_are_sorted = false, - slice_sizes = dense<[1, 1, 256]> : tensor<3xi64>} : + slice_sizes = array} : (tensor<1x128x256xf32>, tensor<30x1x2xi32>) -> tensor<30x1x256xf32> func.return %0 : tensor<30x1x256xf32> } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-conv.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-conv.mlir index 65f988cebe7c70..a8a3c8a18683b5 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-conv.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize-tfl-stablehlo-conv.mlir @@ -2,13 +2,14 @@ module { func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { - %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.convolution", custom_option = #tfl} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> + %0 = "tfl.custom"(%arg0, %arg1) {custom_code = "stablehlo.convolution", custom_option = #tfl} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> func.return %0 : tensor<16x8x8x1xf32> } } - -// CHECK: module { -// CHECK: func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { -// CHECK: %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [0, 1, b, f]x[0, 1, o, i]->[f, 0, 1, b], window = {stride = [1, 1], pad = {{\[}}[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> -// CHECK: return %0 : tensor<16x8x8x1xf32> +// CHECK: module { +// CHECK-NEXT: func.func @main(%arg0: tensor<8x8x1x207xf32>, %arg1: 
tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> { +// CHECK-NEXT: %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [0, 1, b, f]x[0, 1, o, i]->[f, 0, 1, b], window = {stride = [1, 1], pad = {{\[}}[1, 1], [1, 1]], lhs_dilate = [1, 1], rhs_dilate = [1, 1], reverse = [true, false]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64, precision_config = [#stablehlo, #stablehlo]} : (tensor<8x8x1x207xf32>, tensor<3x3x16x207xf32>) -> tensor<16x8x8x1xf32> +// CHECK-NEXT: return %0 : tensor<16x8x8x1xf32> +// CHECK-NEXT: } +// CHECK-NEXT: } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir index d9d86dac6782e6..4152a1b785daa6 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/legalize_hlo.mlir @@ -8,7 +8,7 @@ // CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> // CHECK: } func.func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : tensor<1x32x10x32xi32> } @@ -19,7 +19,7 @@ func.func @biasAdd_NHWC(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> // CHECK: return %[[VAL_2]] : tensor<1x32x10x32xi32> // CHECK: } func.func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : 
tensor<1x32x10x32xi32> } @@ -30,7 +30,7 @@ func.func @biasAdd_NCHW(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @biasAdd_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor, tensor) -> tensor func.return %0 : tensor } @@ -68,7 +68,7 @@ func.func @broadcast_add(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> ( // CHECK: return %[[VAL_2]] : tensor<1x2xi32> // CHECK: } func.func @broadcast_add_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0 : tensor<1x2xi32> } @@ -79,14 +79,14 @@ func.func @broadcast_add_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> t // CHECK: return %[[VAL_2]] : tensor<4x4x4x4xi32> // CHECK: } func.func @broadcast_multi_dim_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[1, 2, 3]> : tensor<3xi64>} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> func.return %0 : tensor<4x4x4x4xi32> } // CHECK-LABEL: func @unsupported_broadcast_add // CHECK: chlo.broadcast_add func.func @unsupported_broadcast_add(%arg0: tensor<4x1x1xi32>, %arg1: tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<[0, 1, 2]> : tensor<3xi64>} : (tensor<4x1x1xi32>, 
tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<4x1x1xi32>, tensor<4x4x4x4xi32>) -> tensor<4x4x4x4xi32> func.return %0 : tensor<4x4x4x4xi32> } @@ -122,7 +122,7 @@ func.func @broadcast_div(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> ( // CHECK: return %[[VAL_2]] : tensor<1x2xi32> // CHECK: } func.func @broadcast_div_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0 : tensor<1x2xi32> } @@ -159,7 +159,7 @@ func.func @broadcast_shift_left(%arg0: tensor<1xi32>, %arg1: tensor<4xi32>) -> ( // CHECK: return %[[VAL_2]] : tensor // CHECK: } func.func @div_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor, tensor) -> tensor + %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor, tensor) -> tensor func.return %0 : tensor } @@ -247,7 +247,7 @@ func.func @broadcast_mul(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> ( // CHECK: return %[[VAL_2]] : tensor<1x2xi32> // CHECK: } func.func @broadcast_mul_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "chlo.broadcast_multiply"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0 : tensor<1x2xi32> } @@ -268,7 +268,7 @@ func.func @real_div(%arg0: tensor<2xi32>) -> tensor<2xi32> { // CHECK: return %[[VAL_2]] : tensor<1x2xi32> // CHECK: } func.func 
@broadcast_real_div(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0 : tensor<1x2xi32> } @@ -304,7 +304,7 @@ func.func @broadcast_sub(%arg0: tensor<1x1xf32>, %arg1: tensor<1x1000xf32>) -> ( // CHECK: return %[[VAL_2]] : tensor<1x2xi32> // CHECK: } func.func @broadcast_sub_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi32> { - %0 = "chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> + %0 = "chlo.broadcast_subtract"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi32> func.return %0 : tensor<1x2xi32> } @@ -341,7 +341,7 @@ func.func @shift_right(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi // CHECK: return %[[VAL_2]] : tensor<2x4xi32> // CHECK: } func.func @broadcast_shift_right(%arg0: tensor<4xi32>, %arg1: tensor<2x4xi32>) -> tensor<2x4xi32> { - %0 = "chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> + %0 = "chlo.broadcast_shift_right_arithmetic"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<4xi32>, tensor<2x4xi32>) -> tensor<2x4xi32> func.return %0 : tensor<2x4xi32> } @@ -363,7 +363,7 @@ func.func @and(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @and_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_and"(%arg0, 
%arg1) {broadcast_dimensions = array} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -396,7 +396,7 @@ func.func @or(%arg0: tensor<2xi1>, %arg1: tensor<2xi1>) -> tensor<2xi1> { // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @or_broadcast(%arg0: tensor<1xi1>, %arg1: tensor<1x2xi1>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi1>, tensor<1x2xi1>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -441,7 +441,7 @@ func.func @bitwise_or_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> t // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_or_broadcast_chlo(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "chlo.broadcast_or"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> func.return %0 : tensor<1x4xi8> } @@ -474,7 +474,7 @@ func.func @bitwise_xor(%arg0: tensor<4xi32>, %arg1: tensor<4xi32>) -> tensor<4xi // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_xor_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<[1]> : tensor<1xi64>} : (tensor<1xi8>) -> tensor<1x4xi8> + %0 = "mhlo.broadcast_in_dim"(%arg0) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>) -> tensor<1x4xi8> %1 = mhlo.xor %0, %arg1 : tensor<1x4xi8> func.return %1 : tensor<1x4xi8> } @@ -509,7 +509,7 @@ func.func @bitwise_and_broadcast(%arg0: tensor<1xi8>, %arg1: tensor<1x4xi8>) -> // CHECK: return %[[VAL_2]] : tensor<1x4xi8> // CHECK: } func.func @bitwise_and_broadcast_chlo(%arg0: 
tensor<1xi8>, %arg1: tensor<1x4xi8>) -> tensor<1x4xi8> { - %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> + %0 = "chlo.broadcast_and"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1xi8>, tensor<1x4xi8>) -> tensor<1x4xi8> func.return %0 : tensor<1x4xi8> } @@ -584,16 +584,16 @@ func.func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) %1 = "chlo.broadcast_compare"(%arg0, %0) {comparison_direction = #chlo} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> %2 = mhlo.constant dense<0> : tensor<3xi32> %3 = "chlo.broadcast_compare"(%arg1, %2) {comparison_direction = #chlo} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> - %4 = "chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> - %5 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %4 = "chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<2x3xi1>, tensor<3xi1>) -> tensor<2x3xi1> + %5 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %6 = "mhlo.abs"(%arg0) : (tensor<2x3xi32>) -> tensor<2x3xi32> %7 = "mhlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> %8 = mhlo.constant dense<1> : tensor<3xi32> %9 = mhlo.subtract %7, %8 : tensor<3xi32> - %10 = "chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %10 = "chlo.broadcast_add"(%6, %9) {broadcast_dimensions = array} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %11 = "mhlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = "mhlo.abs"(%arg1) : (tensor<3xi32>) -> tensor<3xi32> - %13 = "chlo.broadcast_divide"(%11, %12) 
{broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> + %13 = "chlo.broadcast_divide"(%11, %12) {broadcast_dimensions = array} : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> %14 = "mhlo.select"(%4, %5, %13) : (tensor<2x3xi1>, tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> func.return %14 : tensor<2x3xi32> } @@ -623,13 +623,13 @@ func.func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x %1 = "mhlo.compare"(%arg0, %0) {comparison_direction = #mhlo} : (tensor<3xi32>, tensor<3xi32>) -> tensor<3xi1> %2 = mhlo.constant dense<0> : tensor<2x3xi32> %3 = "chlo.broadcast_compare"(%arg1, %2) {comparison_direction = #chlo} : (tensor<2x3xi32>, tensor<2x3xi32>) -> tensor<2x3xi1> - %4 = "chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> - %5 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %4 = "chlo.broadcast_compare"(%1, %3) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<3xi1>, tensor<2x3xi1>) -> tensor<2x3xi1> + %5 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %6 = "mhlo.abs"(%arg0) : (tensor<3xi32>) -> tensor<3xi32> %7 = "mhlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %8 = mhlo.constant dense<1> : tensor<2x3xi32> %9 = mhlo.subtract %7, %8 : tensor<2x3xi32> - %10 = "chlo.broadcast_add"(%6, %9) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> + %10 = "chlo.broadcast_add"(%6, %9) {broadcast_dimensions = array} : (tensor<3xi32>, tensor<2x3xi32>) -> tensor<2x3xi32> %11 = "mhlo.negate"(%10) : (tensor<2x3xi32>) -> tensor<2x3xi32> %12 = "mhlo.abs"(%arg1) : (tensor<2x3xi32>) -> tensor<2x3xi32> %13 = mhlo.divide %11, %12 : 
tensor<2x3xi32> @@ -660,8 +660,8 @@ func.func @floordiv_f32(%arg0: tensor<2xf32>) -> tensor<2xf32> { // CHECK: return %[[VAL_4]] : tensor<2x3xf16> // CHECK: } func.func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) -> tensor<2x3xf16> { - %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> - %1 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %0 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> + %1 = "chlo.broadcast_divide"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<2x3xf16>, tensor<3xf16>) -> tensor<2x3xf16> %2 = "mhlo.floor"(%1) : (tensor<2x3xf16>) -> tensor<2x3xf16> func.return %2 : tensor<2x3xf16> } @@ -707,7 +707,7 @@ func.func @equal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> te // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @equal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -718,7 +718,7 @@ func.func @equal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @equal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<2xi32>, tensor<1x2xi32>) -> 
tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -736,7 +736,7 @@ func.func @equal_incompatible_shape_broadcastable(%arg0: tensor, %arg1: t // CHECK-LABEL: func @equal_unsupported_compare_type func.func @equal_unsupported_compare_type(%arg0: tensor<1xf32>, %arg1: tensor<1x2xf32>) -> tensor<1x2xi1> { // CHECK: chlo.broadcast_compare - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, compare_type = #chlo, comparison_direction = #chlo} : (tensor<1xf32>, tensor<1x2xf32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, compare_type = #chlo, comparison_direction = #chlo} : (tensor<1xf32>, tensor<1x2xf32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -770,7 +770,7 @@ func.func @notequal_broadcast(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -781,7 +781,7 @@ func.func @notequal_broadcast_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @notequal_broadcast_no_incompatible_shapes_error(%arg0: tensor<2xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<2xi32>, tensor<1x2xi32>) -> 
tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<2xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -826,7 +826,7 @@ func.func @broadcast_greater(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_greater_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -867,7 +867,7 @@ func.func @broadcast_greater_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32 // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_greater_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -901,7 +901,7 @@ func.func @broadcast_less(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) -> ten // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_less_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, 
comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -935,7 +935,7 @@ func.func @broadcast_less_equal(%arg0: tensor<1x1xi32>, %arg1: tensor<1x2xi32>) // CHECK: return %[[VAL_2]] : tensor<1x2xi1> // CHECK: } func.func @broadcast_less_equal_chlo(%arg0: tensor<1xi32>, %arg1: tensor<1x2xi32>) -> tensor<1x2xi1> { - %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> + %0 = "chlo.broadcast_compare"(%arg0, %arg1) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor<1xi32>, tensor<1x2xi32>) -> tensor<1x2xi1> func.return %0 : tensor<1x2xi1> } @@ -980,7 +980,7 @@ func.func @const() -> tensor<2xi32> { // CHECK: } func.func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor - %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = array} : (tensor, tensor<1xi32>) -> tensor<1xi32> func.return %1 : tensor<1xi32> } @@ -992,7 +992,7 @@ func.func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK: } func.func @relu_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<0> : tensor - %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %1 = "chlo.broadcast_maximum"(%0, %arg0) {broadcast_dimensions = array} : (tensor, tensor) -> tensor func.return %1 : tensor } @@ -1007,8 +1007,8 @@ func.func @relu_unranked(%arg0: tensor) -> tensor { func.func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<6> : tensor - %2 = "chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> - %3 = 
"chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %2 = "chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = array} : (tensor<1xi32>, tensor) -> tensor<1xi32> + %3 = "chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = array} : (tensor<1xi32>, tensor) -> tensor<1xi32> func.return %3 : tensor<1xi32> } @@ -1023,8 +1023,8 @@ func.func @relu6(%arg0: tensor<1xi32>) -> tensor<1xi32> { func.func @relu6_unranked(%arg0: tensor) -> tensor { %0 = mhlo.constant dense<0> : tensor %1 = mhlo.constant dense<6> : tensor - %2 = "chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor - %3 = "chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>} : (tensor, tensor) -> tensor + %2 = "chlo.broadcast_minimum"(%arg0, %1) {broadcast_dimensions = array} : (tensor, tensor) -> tensor + %3 = "chlo.broadcast_maximum"(%2, %0) {broadcast_dimensions = array} : (tensor, tensor) -> tensor func.return %3 : tensor } @@ -1039,7 +1039,7 @@ func.func @relu6_unranked(%arg0: tensor) -> tensor { // CHECK: } func.func @relu_grad(%arg0: tensor<4x8xf32>, %arg1: tensor) -> tensor<4x8xf32> { %0 = mhlo.constant dense<0.000000e+00> : tensor - %1 = "chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = dense<[]> : tensor<0xi64>, comparison_direction = #chlo} : (tensor, tensor) -> tensor + %1 = "chlo.broadcast_compare"(%arg1, %0) {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor, tensor) -> tensor %2 = mhlo.constant dense<0.000000e+00> : tensor<4x8xf32> %3 = "mhlo.select"(%1, %arg0, %2) : (tensor, tensor<4x8xf32>, tensor<4x8xf32>) -> tensor<4x8xf32> func.return %3 : tensor<4x8xf32> @@ -2140,6 +2140,8 @@ func.func @convert_dot_general_dynamic_contracting_dim(%arg0: tensor<4x4x?xf32>, func.return %0 : tensor<4x4x256xf32> } + + // CHECK-LABEL: func.func @convert_conv1d( // CHECK-SAME: %[[VAL_0:.*]]: 
tensor<16x32x256xbf16>, // CHECK-SAME: %[[VAL_1:.*]]: tensor<1x256x256xbf16>) -> tensor<16x32x256xbf16> { @@ -2204,7 +2206,29 @@ func.func @convert_conv1d_dynamic_batch(%arg0: tensor, %arg1: ten func.return %0 : tensor } - +// CHECK-LABEL: convert_dynamic_1d_group_conv +func.func private @convert_dynamic_1d_group_conv(%arg1: tensor, %arg2: tensor<768x48x128xf32>) -> (tensor) { + %0 = mhlo.convolution(%arg1, %arg2) + dim_numbers = [b, f, 0]x[o, i, 0]->[b, f, 0], + window = {pad = [[64, 64]]} + {batch_group_count = 1 : i64, feature_group_count = 16 : i64} + : (tensor, tensor<768x48x128xf32>) -> tensor + return %0 : tensor +// CHECK: %cst = arith.constant dense<[-9223372036854775808, 768, 2, 1]> : tensor<4xi64> +// CHECK: %0 = "tf.Reshape"(%arg0, %cst) : (tensor, tensor<4xi64>) -> tensor +// CHECK: %cst_0 = "tf.Const"() <{value = dense<[0, 2, 3, 1]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %1 = "tf.Transpose"(%0, %cst_0) : (tensor, tensor<4xi64>) -> tensor +// CHECK: %cst_1 = arith.constant dense<[768, 48, 128, 1]> : tensor<4xi64> +// CHECK: %2 = "tf.Reshape"(%arg1, %cst_1) : (tensor<768x48x128xf32>, tensor<4xi64>) -> tensor<768x48x128x1xf32> +// CHECK: %cst_2 = "tf.Const"() <{value = dense<[2, 3, 1, 0]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %3 = "tf.Transpose"(%2, %cst_2) : (tensor<768x48x128x1xf32>, tensor<4xi64>) -> tensor<128x1x48x768xf32> +// CHECK: %4 = "tf.Conv2D"(%1, %3) <{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [0, 0, 64, 64, 0, 0, 0, 0], padding = "EXPLICIT", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor, tensor<128x1x48x768xf32>) -> tensor +// CHECK: %cst_3 = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %5 = "tf.Transpose"(%4, %cst_3) : (tensor, tensor<4xi64>) -> tensor +// CHECK: %cst_4 = arith.constant dense<[-9223372036854775808, 768, 3]> : tensor<3xi64> +// CHECK: %6 = "tf.Reshape"(%5, %cst_4) : (tensor, tensor<3xi64>) -> tensor +// CHECK: 
return %6 : tensor +} // CHECK-LABEL: func.func @convert_conv1d_no_lhs_dil_rhs_dil_precision_conf( // CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32x256xbf16>, @@ -2318,13 +2342,22 @@ func.func @no_convert_conv1d_dynamic(%arg0: tensor<16x?x256xbf16>, %arg1: tensor func.return %0 : tensor<16x?x256xbf16> } -// CHECK-LABEL: func.func @no_convert_conv1d_feature_group_gt_1( -// CHECK-SAME: %[[VAL_0:.*]]: tensor<16x32x256xbf16>, -// CHECK-SAME: %[[VAL_1:.*]]: tensor<1x128x128xbf16>) -> tensor<16x32x128xbf16> { -// CHECK: %[[VAL_2:.*]] = mhlo.convolution(%[[VAL_0]], %[[VAL_1]]) dim_numbers = [b, 0, f]x[0, i, o]->[b, 0, f], window = {stride = [1], pad = {{\[\[}}0, 0]], lhs_dilate = [1], rhs_dilate = [1]} {batch_group_count = 1 : i64, feature_group_count = 2 : i64, precision_config = [#mhlo, #mhlo]} : (tensor<16x32x256xbf16>, tensor<1x128x128xbf16>) -> tensor<16x32x128xbf16> -// CHECK: return %[[VAL_2]] : tensor<16x32x128xbf16> -// CHECK: } -func.func @no_convert_conv1d_feature_group_gt_1(%arg0: tensor<16x32x256xbf16>, %arg1: tensor<1x128x128xbf16>) -> tensor<16x32x128xbf16> { +// CHECK-LABEL: func.func @convert_conv1d_feature_group_gt_1( +// CHECK: %cst = arith.constant dense<[16, 32, 256, 1]> : tensor<4xi64> +// CHECK: %0 = "tf.Reshape"(%arg0, %cst) : (tensor<16x32x256xbf16>, tensor<4xi64>) -> tensor<16x32x256x1xbf16> +// CHECK: %cst_0 = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %1 = "tf.Transpose"(%0, %cst_0) : (tensor<16x32x256x1xbf16>, tensor<4xi64>) -> tensor<16x32x1x256xbf16> +// CHECK: %cst_1 = arith.constant dense<[1, 128, 128, 1]> : tensor<4xi64> +// CHECK: %2 = "tf.Reshape"(%arg1, %cst_1) : (tensor<1x128x128xbf16>, tensor<4xi64>) -> tensor<1x128x128x1xbf16> +// CHECK: %cst_2 = "tf.Const"() <{value = dense<[0, 3, 1, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %3 = "tf.Transpose"(%2, %cst_2) : (tensor<1x128x128x1xbf16>, tensor<4xi64>) -> tensor<1x1x128x128xbf16> +// CHECK: %4 = "tf.Conv2D"(%1, %3) 
<{data_format = "NHWC", dilations = [1, 1, 1, 1], explicit_paddings = [], padding = "VALID", strides = [1, 1, 1, 1], use_cudnn_on_gpu = true}> : (tensor<16x32x1x256xbf16>, tensor<1x1x128x128xbf16>) -> tensor<16x32x1x128xbf16> +// CHECK: %cst_3 = "tf.Const"() <{value = dense<[0, 1, 3, 2]> : tensor<4xi64>}> : () -> tensor<4xi64> +// CHECK: %5 = "tf.Transpose"(%4, %cst_3) : (tensor<16x32x1x128xbf16>, tensor<4xi64>) -> tensor<16x32x128x1xbf16> +// CHECK: %cst_4 = arith.constant dense<[16, 32, 128]> : tensor<3xi64> +// CHECK: %6 = "tf.Reshape"(%5, %cst_4) : (tensor<16x32x128x1xbf16>, tensor<3xi64>) -> tensor<16x32x128xbf16> +// CHECK: return %6 : tensor<16x32x128xbf16> +func.func @convert_conv1d_feature_group_gt_1(%arg0: tensor<16x32x256xbf16>, %arg1: tensor<1x128x128xbf16>) -> tensor<16x32x128xbf16> { %0 = "mhlo.convolution"(%arg0, %arg1) { batch_group_count = 1 : i64, dimension_numbers = #mhlo.conv<[b, 0, f]x[0, i, o]->[b, 0, f]>, diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir index 4e12ffd931c5f2..27e22cb524b8af 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/tf-tfl-translate-serialize-stablehlo.mlir @@ -11,12 +11,12 @@ func.func @tfInplaceUpdate(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> { } } -//CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.metadata = {keep_stablehlo_constant = "true"}, tfl.schema_version = 3 : i32} { +//CHECK: module attributes +//CHECK-SAME: keep_stablehlo_constant = "true" //CHECK-NEXT: func.func @main(%arg0: tensor<2x1x2xf32>) -> tensor<2x1x2xf32> attributes {tf.entry_function = {inputs = "arg0", outputs = "stablehlo.dynamic_update_slice"}} { //CHECK-DAG: %0 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x2xf32> //CHECK-DAG: %1 = stablehlo.constant dense<1> : tensor 
//CHECK-DAG: %2 = stablehlo.constant dense<0> : tensor //CHECK-NEXT: %3 = stablehlo.dynamic_update_slice %arg0, %0, %1, %2, %2 : (tensor<2x1x2xf32>, tensor<1x1x2xf32>, tensor, tensor, tensor) -> tensor<2x1x2xf32> //CHECK-NEXT: return %3 : tensor<2x1x2xf32> -//CHECK-NEXT: } -//CHECK-NEXT:} \ No newline at end of file +//CHECK-NEXT: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir index c87e75e8ed1ee5..fd850862f15196 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir +++ b/tensorflow/compiler/mlir/lite/stablehlo/tests/uniform-quantized-stablehlo-to-tfl.mlir @@ -1,12 +1,19 @@ // RUN: odml-to-stablehlo-opt --uniform-quantized-stablehlo-to-tfl \ // RUN: --split-input-file --verify-diagnostics %s | FileCheck %s -// CHECK-LABEL: uniform_quantize_op +// ============================================================================ +// The following functions tests example quantization patterns outputted from +// JAX Quantizer. JAX Quantizer should output integer types, which are +// composed into `UniformQuantized{|PerAxis}Type` via +// `compose_uniform_quantized_type_pass.cc`. 
+// ============================================================================ + func.func @uniform_quantize_op(%arg: tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> { %0 = stablehlo.uniform_quantize %arg : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %0 : tensor<2x2x!quant.uniform> } -// CHECK: %[[QUANT:.*]] = "tfl.quantize"({{.*}}) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> +// CHECK-LABEL: uniform_quantize_op +// CHECK: %[[QUANT:.+]] = "tfl.quantize"({{.*}}) {qtype = tensor<2x2x!quant.uniform>} : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> // CHECK: return %[[QUANT]] // ----- @@ -14,11 +21,11 @@ func.func @uniform_quantize_op(%arg: tensor<2x2xf32>) -> tensor<2x2x!quant.unifo // Tests that the pattern doesn't match when the input tensor's type is a // quantized type. -// CHECK-LABEL: uniform_quantize_op_quantized_input func.func @uniform_quantize_op_quantized_input(%arg: tensor<2x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> { %0 = stablehlo.uniform_quantize %arg : (tensor<2x2x!quant.uniform>) -> tensor<2x2x!quant.uniform> return %0 : tensor<2x2x!quant.uniform> } +// CHECK-LABEL: uniform_quantize_op_quantized_input // CHECK: stablehlo.uniform_quantize // CHECK-NOT: tfl.quantize @@ -28,11 +35,11 @@ func.func @uniform_quantize_op_quantized_input(%arg: tensor<2x2x!quant.uniform) -> tensor<2x2x!quant.uniform> { %0 = stablehlo.uniform_quantize %arg : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %0 : tensor<2x2x!quant.uniform> } +// CHECK-LABEL: uniform_quantize_op_uint16_output // CHECK: stablehlo.uniform_quantize // CHECK-NOT: tfl.quantize @@ -42,22 +49,22 @@ func.func @uniform_quantize_op_uint16_output(%arg: tensor<2x2xf32>) -> tensor<2x // is i32. i32 storage type for quantized type is not compatible with // `tfl.quantize`. 
-// CHECK-LABEL: uniform_quantize_op_i32_output func.func @uniform_quantize_op_i32_output(%arg: tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> { %0 = stablehlo.uniform_quantize %arg : (tensor<2x2xf32>) -> tensor<2x2x!quant.uniform> return %0 : tensor<2x2x!quant.uniform> } +// CHECK-LABEL: uniform_quantize_op_i32_output // CHECK: stablehlo.uniform_quantize // CHECK-NOT: tfl.quantize // ----- -// CHECK-LABEL: uniform_dequantize_op func.func @uniform_dequantize_op(%arg: tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> { %0 = stablehlo.uniform_dequantize %arg : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> return %0 : tensor<2x2xf32> } -// CHECK: %[[DEQUANT:.*]] = "tfl.dequantize"({{.*}}) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> +// CHECK-LABEL: uniform_dequantize_op +// CHECK: %[[DEQUANT:.+]] = "tfl.dequantize"({{.*}}) : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> // CHECK: return %[[DEQUANT]] // ----- @@ -66,11 +73,11 @@ func.func @uniform_dequantize_op(%arg: tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> { %0 = stablehlo.uniform_dequantize %arg : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> return %0 : tensor<2x2xf32> } +// CHECK-LABEL: uniform_dequantize_op_ui16_storage_input // CHECK: stablehlo.uniform_dequantize // CHECK-NOT: tfl.dequantize @@ -80,11 +87,11 @@ func.func @uniform_dequantize_op_ui16_storage_input(%arg: tensor<2x2x!quant.unif // storage type is i32. i32 storage type is not compatible with // `tfl.dequantize`. -// CHECK-LABEL: uniform_dequantize_op_i32_storage_input func.func @uniform_dequantize_op_i32_storage_input(%arg: tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> { %0 = stablehlo.uniform_dequantize %arg : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf32> return %0 : tensor<2x2xf32> } +// CHECK-LABEL: uniform_dequantize_op_i32_storage_input // CHECK: stablehlo.uniform_dequantize // CHECK-NOT: tfl.dequantize @@ -94,109 +101,104 @@ func.func @uniform_dequantize_op_i32_storage_input(%arg: tensor<2x2x!quant.unifo // storage type is i32. 
i32 storage type is not compatible with // `tfl.dequantize`. -// CHECK-LABEL: uniform_dequantize_op_return_f64 func.func @uniform_dequantize_op_return_f64(%arg: tensor<2x2x!quant.uniform>) -> tensor<2x2xf64> { %0 = stablehlo.uniform_dequantize %arg : (tensor<2x2x!quant.uniform>) -> tensor<2x2xf64> return %0 : tensor<2x2xf64> } +// CHECK-LABEL: uniform_dequantize_op_return_f64 // CHECK: stablehlo.uniform_dequantize // CHECK-NOT: tfl.dequantize // ----- -// CHECK-LABEL: convolution_upstream_full_integer -func.func @convolution_upstream_full_integer(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { +func.func @convolution_upstream_same_padding_srq(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> return %1 : tensor<1x3x3x2x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x3x3x4x!quant.uniform> -// CHECK-DAG: %[[CONST_0:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32> // Note that the quantized dimension is 0, and the shape has been transposed // to (2, 3, 3, 4). -// CHECK-DAG: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform> -// CHECK-DAG: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// Explicit tfl.pad op to reflect explicit padding attribute. 
-// CHECK: %[[PAD:.*]] = "tfl.pad"(%[[ARG]], %[[CONST_0]]) : (tensor<1x3x3x4x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x5x5x4x!quant.uniform> -// CHECK: %[[CONV2D:.*]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x5x5x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> +// CHECK-LABEL: convolution_upstream_same_padding_srq +// CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x3x3x2x!quant.uniform> // ----- -// CHECK-LABEL: convolution_upstream_full_integer_non_const_filter -func.func @convolution_upstream_full_integer_non_const_filter(%arg0: tensor<1x3x3x4x!quant.uniform>, %arg1: tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { +func.func @convolution_upstream_srq_non_const_filter(%arg0: tensor<1x3x3x4x!quant.uniform>, %arg1: tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> { %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 
1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x3x2x!quant.uniform> return %0 : tensor<1x3x3x2x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x3x3x4x!quant.uniform> - // Confirm that the `stablehlo.convolution` is not converted to `tfl.conv_2d`. +// CHECK-LABEL: convolution_upstream_srq_non_const_filter +// CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> // CHECK: stablehlo.convolution // CHECK-NOT: tfl.conv_2d // ----- -// Test that if the window padding contains values of 0, tfl.pad op is not +// Tests that if the window padding contains values of 0, tfl.pad op is not // created and the `padding` attribute is set as "VALID". -// CHECK-LABEL: convolution_upstream_full_integer_valid_padding -func.func @convolution_upstream_full_integer_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { +func.func @convolution_upstream_srq_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 0], [0, 0]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> return %1 : tensor<1x1x1x2x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK-LABEL: convolution_upstream_srq_valid_padding +// 
CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> // CHECK-NOT: tfl.pad -// CHECK: %[[CONV2D:.*]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x1x1x2x!quant.uniform> // ----- -// Test that if the window padding value is missing, tfl.pad op is not +// Tests that if the window padding value is missing, tfl.pad op is not // created and the `padding` attribute is set as "VALID". -// CHECK-LABEL: convolution_upstream_full_integer_valid_padding -func.func @convolution_upstream_full_integer_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { +func.func @convolution_upstream_srq_valid_padding(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> // The `window` attribute is empty. 
%1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> return %1 : tensor<1x1x1x2x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform> -// CHECK: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[CONV2D:.*]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> +// CHECK-LABEL: convolution_upstream_srq_valid_padding +// CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[ARG]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x3x3x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x1x1x2x!quant.uniform> // CHECK: return 
%[[CONV2D]] : tensor<1x1x1x2x!quant.uniform> // ----- -// Test that if the window stride value is explicitly set, the attribute +// Tests that if the window stride value is explicitly set, the attribute // value is transferred to tfl.conv_2d's stridw_h and stride_w values. -// CHECK-LABEL: convolution_upstream_full_integer_strides -func.func @convolution_upstream_full_integer_strides(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> { +func.func @convolution_upstream_srq_strides(%arg0: tensor<1x3x3x4x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x4x2xi8>} : () -> tensor<3x3x4x2x!quant.uniform> // The stride value is explicitly set to [1, 2]. %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {stride = [1, 2], pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4x!quant.uniform>, tensor<3x3x4x2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> return %1 : tensor<1x3x2x2x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x3x3x4x!quant.uniform> -// CHECK-DAG: %[[CONST:.*]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32> -// CHECK-DAG: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform> -// CHECK-DAG: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> -// CHECK: %[[PAD:.*]] = "tfl.pad"(%arg0, %[[CONST]]) : (tensor<1x3x3x4x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x5x5x4x!quant.uniform> +// CHECK-LABEL: convolution_upstream_srq_strides +// CHECK-SAME: %[[ARG:.+]]: tensor<1x3x3x4x!quant.uniform> +// CHECK-DAG: %[[CONST:.+]] = "tfl.pseudo_const"(){{.*}}dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32> +// 
CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<3> : tensor<2x3x3x4xi8>} : () -> tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[PAD:.+]] = "tfl.pad"(%arg0, %[[CONST]]) : (tensor<1x3x3x4x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x5x5x4x!quant.uniform> // Tests that the stride_w is set to 2. -// CHECK: %[[CONV2D:.*]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<1x5x5x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform>, tensor<2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> +// CHECK: %[[CONV2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 2 : i32} : (tensor<1x5x5x4x!quant.uniform>, tensor<2x3x3x4x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x3x2x2x!quant.uniform> // CHECK: return %[[CONV2D]] : tensor<1x3x2x2x!quant.uniform> // ----- -// Test full integer quantized dot_general with asymmetric quantized input. +// Tests static range quantized dot_general with asymmetric quantized input. 
-// CHECK-LABEL: dot_general_upstream_full_integer_asym_input -func.func @dot_general_upstream_full_integer_asym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_asym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -208,16 +210,16 @@ func.func @dot_general_upstream_full_integer_asym_input(%arg0: tensor<1x2x3x4x!q } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> -// CHECK: %[[BMM:.*]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK-LABEL: dot_general_upstream_srq_asym_input +// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // ----- -// Test full integer quantized dot_general with symmetric quantized input. +// Tests static range quantized dot_general with symmetric quantized input. 
-// CHECK-LABEL: dot_general_upstream_full_integer_sym_input -func.func @dot_general_upstream_full_integer_sym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_sym_input(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -230,41 +232,16 @@ func.func @dot_general_upstream_full_integer_sym_input(%arg0: tensor<1x2x3x4x!qu } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } - -// CHECK-SAME: %[[ARG:.*]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() +// CHECK-LABEL: dot_general_upstream_srq_sym_input +// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() // CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} // ----- -// Tests that the pattern does not match when the output tensor's storage -// type is i32. Currently we support qi8, qi8 -> qi8 only for GEMM ops that -// are quantized upstream. Other cases should be handled by regular quantized -// stablehlo.dot_general case. 
+// Tests static range quantized dot_general with activation as RHS -// CHECK-LABEL: dot_general_upstream_full_integer_i32_output -func.func @dot_general_upstream_full_integer_i32_output(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { - %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> - %1 = "stablehlo.dot_general"(%arg0, %0) { - dot_dimension_numbers = #stablehlo.dot< - lhs_batching_dimensions = [0, 1], - rhs_batching_dimensions = [0, 1], - lhs_contracting_dimensions = [3], - rhs_contracting_dimensions = [2] - >, - precision_config = [#stablehlo, #stablehlo] - } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> - return %1 : tensor<1x2x3x5x!quant.uniform> -} -// CHECK: stablehlo.dot_general -// CHECK-NOT: tfl.quantize - -// ----- - -// Test full integer quantized dot_general with activation as RHS - -// CHECK-LABEL: dot_general_upstream_full_integer_activation_rhs -func.func @dot_general_upstream_full_integer_activation_rhs(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_activation_rhs(%arg0: tensor<1x2x3x4x!quant.uniform>, %arg1: tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -276,14 +253,15 @@ func.func @dot_general_upstream_full_integer_activation_rhs(%arg0: tensor<1x2x3x } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %0 : tensor<1x2x3x5x!quant.uniform> } +// CHECK-LABEL: dot_general_upstream_srq_activation_rhs // CHECK: "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // ----- -// Test full integer quantized 
dot_general with adj_x +// Tests static range quantized dot_general with adj_x -// CHECK-LABEL: dot_general_upstream_full_integer_adj_x -func.func @dot_general_upstream_full_integer_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_adj_x +func.func @dot_general_upstream_srq_adj_x(%arg0: tensor<1x2x4x3x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -297,17 +275,15 @@ func.func @dot_general_upstream_full_integer_adj_x(%arg0: tensor<1x2x4x3x!quant. } : (tensor<1x2x4x3x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } - -// CHECK-SAME: %[[ARG:.*]]: tensor<1x2x4x3x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x4x3x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> // CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = true, adj_y = false} // ----- -// Test full integer quantized dot_general with adj_y +// Tests static range quantized dot_general with adj_y -// CHECK-LABEL: dot_general_upstream_full_integer_adj_y -func.func @dot_general_upstream_full_integer_adj_y(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_adj_y(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x5x4xi8>} : () -> tensor<1x2x5x4x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { 
dot_dimension_numbers = #stablehlo.dot< @@ -321,17 +297,16 @@ func.func @dot_general_upstream_full_integer_adj_y(%arg0: tensor<1x2x3x4x!quant. } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x5x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } - -// CHECK-SAME: %[[ARG:.*]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x5x4x!quant.uniform>, value = dense<1> : tensor<1x2x5x4xi8>} : () -> tensor<1x2x5x4x!quant.uniform> +// CHECK-LABEL: dot_general_upstream_srq_adj_y +// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x5x4x!quant.uniform>, value = dense<1> : tensor<1x2x5x4xi8>} : () -> tensor<1x2x5x4x!quant.uniform> // CHECK: "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = true} // ----- -// Test full integer quantized dot_general with wrong batch dims +// Tests static range quantized dot_general with wrong batch dims -// CHECK-LABEL: dot_general_upstream_full_integer_too_many_batches -func.func @dot_general_upstream_full_integer_too_many_batches(%arg0: tensor<1x1x1x2x3x4x!quant.uniform>) -> tensor<1x1x1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_too_many_batches(%arg0: tensor<1x1x1x2x3x4x!quant.uniform>) -> tensor<1x1x1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x1x1x2x4x5xi8>} : () -> tensor<1x1x1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -345,15 +320,15 @@ func.func @dot_general_upstream_full_integer_too_many_batches(%arg0: tensor<1x1x return %1 : tensor<1x1x1x2x3x5x!quant.uniform> } // Only support size(batching_dimensions) <= 3 +// CHECK-LABEL: dot_general_upstream_srq_too_many_batches // CHECK: stablehlo.dot_general // CHECK-NOT: tfl.batch_matmul // ----- -// Test full integer quantized dot_general with too many contracting dimension +// Tests 
static range quantized dot_general with too many contracting dimension -// CHECK-LABEL: dot_general_upstream_full_integer_too_many_contractions -func.func @dot_general_upstream_full_integer_too_many_contractions(%arg0: tensor<1x2x3x4x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_too_many_contractions(%arg0: tensor<1x2x3x4x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x4x5xi8>} : () -> tensor<1x2x4x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -367,15 +342,15 @@ func.func @dot_general_upstream_full_integer_too_many_contractions(%arg0: tensor return %1 : tensor<1x2x3x5x!quant.uniform> } // Only support size(contracting_dimensions) == 1 +// CHECK-LABEL: dot_general_upstream_srq_too_many_contractions // CHECK: stablehlo.dot_general // CHECK-NOT: tfl.batch_matmul // ----- -// Test full integer quantized dot_general with unsupported contracting dim +// Tests static range quantized dot_general with unsupported contracting dim -// CHECK-LABEL: dot_general_upstream_full_integer_wrong_contracting -func.func @dot_general_upstream_full_integer_wrong_contracting(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x4x3x5x!quant.uniform> { +func.func @dot_general_upstream_srq_wrong_contracting(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x4x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) { dot_dimension_numbers = #stablehlo.dot< @@ -388,17 +363,17 @@ func.func @dot_general_upstream_full_integer_wrong_contracting(%arg0: tensor<1x2 } : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x4x3x5x!quant.uniform> return %1 : tensor<1x4x3x5x!quant.uniform> } - // Contracting dimension must be the last two dimension +// CHECK-LABEL: dot_general_upstream_srq_wrong_contracting 
// CHECK: stablehlo.dot_general // CHECK-NOT: tfl.batch_matmul // ----- -// Test full integer quantized dot_general with float operands +// Tests static range quantized dot_general with float operands -// CHECK-LABEL: dot_general_upstream_full_integer_float_operands -func.func @dot_general_upstream_full_integer_float_operands(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32> { +// CHECK-LABEL: dot_general_upstream_srq_float_operands +func.func @dot_general_upstream_srq_float_operands(%arg0: tensor<1x2x3x4xf32>, %arg1: tensor<1x2x4x5xf32>) -> tensor<1x2x3x5xf32> { %0 = "stablehlo.dot_general"(%arg0, %arg1) { dot_dimension_numbers = #stablehlo.dot< lhs_batching_dimensions = [0, 1], @@ -416,44 +391,44 @@ func.func @dot_general_upstream_full_integer_float_operands(%arg0: tensor<1x2x3x // ----- -// Test full integer quantized dot_general with asymmetric weight (rhs). +// Tests static range quantized dot_general with asymmetric weight (rhs). -// CHECK-LABEL: dot_general_upstream_full_integer_asym_weight -func.func @dot_general_upstream_full_integer_asym_weight(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_asym_weight +func.func @dot_general_upstream_srq_asym_weight(%arg0: tensor<1x2x3x4x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> %1 = "stablehlo.dot_general"(%arg0, %0) {dot_dimension_numbers = #stablehlo.dot, precision_config = [#stablehlo, #stablehlo]} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> return %1 : tensor<1x2x3x5x!quant.uniform> } -// CHECK-SAME: %[[ARG:.*]]: tensor<1x2x3x4x!quant.uniform> -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> -// CHECK: %[[BMM:.*]] = 
"tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> +// CHECK-SAME: %[[ARG:.+]]: tensor<1x2x3x4x!quant.uniform> +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<1x2x4x5x!quant.uniform>, value = dense<1> : tensor<1x2x4x5xi8>} : () -> tensor<1x2x4x5x!quant.uniform> +// CHECK: %[[BMM:.+]] = "tfl.batch_matmul"(%[[ARG]], %[[QCONST_0]]) {adj_x = false, adj_y = false} : (tensor<1x2x3x4x!quant.uniform>, tensor<1x2x4x5x!quant.uniform>) -> tensor<1x2x3x5x!quant.uniform> // ----- -// Test that when the weight tensor for `stablehlo.dot_general` is per-axis +// Tests that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized, it is converted to `tfl.fully_connected` op. -// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter -func.func @dot_general_upstream_full_integer_per_axis_quantized_filter(%arg0: tensor<1x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter +func.func @dot_general_upstream_srq_per_axis_quantized_filter(%arg0: tensor<1x3x!quant.uniform>) -> tensor<1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<3x2xi8>} : () -> tensor<3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x3x!quant.uniform>, tensor<3x2x!quant.uniform>) -> tensor<1x2x!quant.uniform> return %1 : tensor<1x2x!quant.uniform> } -// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x!quant.uniform> +// CHECK-SAME: %[[ARG_0:.+]]: tensor<1x3x!quant.uniform> // Weight tensor is transposed, as tfl.fully_connected accepts a [o, i] matrix. 
-// CHECK-DAG: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x!quant.uniform>, value = dense<1> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform> -// CHECK-DAG: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform:f32:0, {1.000000e+08,1.500000e+09}>>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform:f32:0, {1.000000e+08,1.500000e+09}>> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, value = dense<1> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> // Bias tensor's scale is input scale * filter scale. -// CHECK: %[[FC:.*]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform>, tensor<2x!quant.uniform:f32:0, {1.000000e+08,1.500000e+09}>>) -> tensor<1x2x!quant.uniform> +// CHECK: %[[FC:.+]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x3x!quant.uniform>, tensor<2x3x!quant.uniform:f32:0, {2.000000e+02,3.000000e+03}>>, tensor<2x!quant.uniform>) -> tensor<1x2x!quant.uniform> // CHECK-NEXT: return %[[FC]] : tensor<1x2x!quant.uniform> // ----- -// Test that when the weight tensor for `stablehlo.dot_general` is per-axis +// Tests that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has a batch dimension, it is not converted. 
-// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_with_batch_dim -func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x1x3x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_with_batch_dim +func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_batch_dim(%arg0: tensor<1x1x3x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<1x1x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x2x!quant.uniform> return %1 : tensor<1x1x2x!quant.uniform> @@ -465,11 +440,11 @@ func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_batc // ----- -// Test that when the weight tensor for `stablehlo.dot_general` is per-axis +// Tests that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has a batch dim > 1, it is not converted. 
-// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_multibatch -func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_multibatch +func.func @dot_general_upstream_srq_per_axis_quantized_filter_multibatch(%arg0: tensor<3x1x3x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<3x3x2xi8>} : () -> tensor<3x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, batching_dims = [0] x [0], contracting_dims = [2] x [1] : (tensor<3x1x3x!quant.uniform>, tensor<3x3x2x!quant.uniform>) -> tensor<3x1x2x!quant.uniform> return %1 : tensor<3x1x2x!quant.uniform> @@ -481,11 +456,11 @@ func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_multibatc // ----- -// Test that when the weight tensor for `stablehlo.dot_general` is per-axis +// Tests that when the weight tensor for `stablehlo.dot_general` is per-axis // quantized but has more than one contracting dimension, it is not converted. 
-// CHECK-LABEL: dot_general_upstream_full_integer_per_axis_quantized_filter_with_multiple_contracting_dims -func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_multiple_contracting_dims(%arg0: tensor<1x2x3x!quant.uniform>) -> tensor<1x1x!quant.uniform> { +// CHECK-LABEL: dot_general_upstream_srq_per_axis_quantized_filter_with_multiple_contracting_dims +func.func @dot_general_upstream_srq_per_axis_quantized_filter_with_multiple_contracting_dims(%arg0: tensor<1x2x3x!quant.uniform>) -> tensor<1x1x!quant.uniform> { %0 = stablehlo.constant() {value = dense<1> : tensor<1x3x2xi8>} : () -> tensor<1x3x2x!quant.uniform> %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1, 2] x [2, 1] : (tensor<1x2x3x!quant.uniform>, tensor<1x3x2x!quant.uniform>) -> tensor<1x1x!quant.uniform> return %1 : tensor<1x1x!quant.uniform> @@ -497,22 +472,592 @@ func.func @dot_general_upstream_full_integer_per_axis_quantized_filter_with_mult // ----- -// Test that a simple per-tensor quantized stablehlo.dot_general is properly -// fused with a subsequent requantize (qi32->qi8) op then legalized. -// Supports the following format: (lhs: qi8, rhs: qi8) -> result: qi32 +// ============================================================================ +// The following functions tests example quantization patterns outputted from +// StableHLO Quantizer. These patterns should be legalized early directly +// to fused tflite ops. +// ============================================================================ + +// Tests that a simple per-tensor quantized `stablehlo.dot_general` is properly +// lowered to fused `tfl.fully_connected`. +// This case covers for the following quantization patterns because +// activation clipping ranges take affect in scale and zp of the final +// `stablehlo.uniform_quantize`. See more details in b/319168201. 
+// * dot_general_fn +// * dot_general_with_relu_fn +// * dot_general_with_relu6_fn + +func.func @dot_general_srq(%arg0: tensor<1x1024x!quant.uniform>) -> (tensor<1x3x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>>) -> tensor<1x3x!quant.uniform> + %2 = stablehlo.uniform_quantize %1 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %2 : tensor<1x3x!quant.uniform> +} +// CHECK-LABEL: dot_general_srq +// CHECK-SAME: (%[[ARG_1:.+]]: tensor<1x1024x!quant.uniform) -> tensor<1x3x!quant.uniform> +// CHECK-NOT: stablehlo.dot_general +// CHECK: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform:f32, 2.000000e+00>>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform:f32, 2.000000e+00>> +// CHECK: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<0> : tensor<3xi32>} : () -> tensor<3x!quant.uniform> +// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG_1]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32, 2.000000e+00>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-NOT: tfl.batch_matmul +// CHECK: return %[[FULLY_CONNECTED]] -// CHECK-LABEL: dot_general_full_integer -// CHECK-SAME: (%[[ARG_1:.*]]: tensor<1x1024x!quant.uniform - func.func @dot_general_full_integer(%arg0: tensor<1x1024x!quant.uniform> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<1x3xf32> {tf_saved_model.index_path = ["output"]}) { - %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>> - %1 = 
stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>>) -> tensor<1x3x!quant.uniform> - %2 = stablehlo.uniform_quantize %1 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> - %3 = stablehlo.uniform_dequantize %2 : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> - return %3 : tensor<1x3xf32> +// ----- + +// Tests that a fused per-tensor quantized `stablehlo.dot_general` is properly +// lowered to fused `tfl.fully_connected`. +// TODO: b/309896242 - Add more support for dynamic bias fusion cases. + +func.func @dot_general_with_bias_same_shape_srq(%arg0: tensor<1x1024x!quant.uniform>) -> (tensor<1x3x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<1> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>> + %1 = stablehlo.constant() {value = dense<2> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32, 2.000000e+0:0>>) -> tensor<1x3x!quant.uniform> + %3 = stablehlo.add %2, %1 : tensor<1x3x!quant.uniform> + %4 = stablehlo.uniform_quantize %3 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> + return %4 : tensor<1x3x!quant.uniform> +} +// CHECK-LABEL: dot_general_with_bias_same_shape +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x1024x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform:f32, 2.000000e+00>>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform:f32, 2.000000e+00>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform>, value = dense<2> : tensor<1x3xi32>} : () -> tensor<3x!quant.uniform> +// CHECK: %[[FULLY_CONNECTED:.+]] = "tfl.fully_connected"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = 
false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform:f32, 2.000000e+00>>, tensor<3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[FULLY_CONNECTED]] + +// ----- + +// Tests that a simple per-channel quantized `stablehlo.convolution` is properly +// lowered to fused `tfl.conv_2d`. +// This case covers for the following quantization patterns because +// activation clipping ranges take affect in scale and zp of the final +// `stablehlo.uniform_quantize`. See more details in b/319168201. +// * conv_fn +// * conv_with_relu_fn +// * conv_with_relu6_fn + +func.func @conv_srq(%arg0: tensor<1x5x5x2x!quant.uniform>) -> (tensor<1x6x6x4x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<3> : tensor<2x2x2x4xi8>} : () -> tensor<2x2x2x4x!quant.uniform> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2x!quant.uniform>, tensor<2x2x2x4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> + %2 = stablehlo.uniform_quantize %1 : (tensor<1x6x6x4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> + return %2 : tensor<1x6x6x4x!quant.uniform> +} +// CHECK-LABEL: func.func @conv_srq +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x5x5x2x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x2x2x2xi8>} : () -> tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_2:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x!quant.uniform>, value = dense<0> : tensor<4xi32>} : () -> 
tensor<4x!quant.uniform> +// CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[QCONST_0]]) : (tensor<1x5x5x2x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x7x7x2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_1]], %[[QCONST_2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x7x7x2x!quant.uniform>, tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> +// CHECK: return %[[CONV_2D]] + +func.func @conv_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.uniform>) -> (tensor<1x32x32x2x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x3x2xi8>} : () -> tensor<3x3x3x2x!quant.uniform> + %1 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x32x3x!quant.uniform>, tensor<3x3x3x2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> + %2 = stablehlo.uniform_quantize %1 : (tensor<1x32x32x2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> + return %2 : tensor<1x32x32x2x!quant.uniform> +} +// CHECK-LABEL: func.func @conv_same_padding_srq +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x32x32x3x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>} : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<0> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor 
= 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK: return %[[CONV_2D]] : tensor<1x32x32x2x!quant.uniform> + +// ----- + +// Tests that a fused per-channel quantized `stablehlo.convolution` is properly +// lowered to fused `tfl.conv_2d`. +// This case covers for the following quantization patterns because +// activation clipping ranges take affect in scale and zp of the final +// `stablehlo.uniform_quantize`. See more details in b/319168201. +// * conv_with_bias_fn +// * conv_with_bias_and_relu_fn +// * conv_with_bias_and_relu6_fn + +func.func @conv_with_bias_and_relu_srq(%arg0: tensor<1x5x5x2x!quant.uniform>) -> (tensor<1x6x6x4x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<5> : tensor<1x1x1x4xi32>} : () -> tensor<1x1x1x4x!quant.uniform> + %1 = stablehlo.constant() {value = dense<3> : tensor<2x2x2x4xi8>} : () -> tensor<2x2x2x4x!quant.uniform> + %2 = stablehlo.broadcast_in_dim %0, dims = [0, 1, 2, 3] : (tensor<1x1x1x4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> + %3 = stablehlo.convolution(%arg0, %1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x5x5x2x!quant.uniform>, tensor<2x2x2x4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> + %4 = stablehlo.add %3, %2 : tensor<1x6x6x4x!quant.uniform> + %5 = stablehlo.uniform_quantize %4 : (tensor<1x6x6x4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> + return %5 : tensor<1x6x6x4x!quant.uniform> } +// CHECK-LABEL: func.func @conv_with_bias_and_relu_srq +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x5x5x2x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> +// CHECK-DAG: %[[CONST_0:.+]] = "tfl.pseudo_const"() {value = dense<{{\[\[0, 0\], \[1, 1\], \[1, 1\], \[0, 
0\]\]}}> : tensor<4x2xi32>} : () -> tensor<4x2xi32> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<4x2x2x2xi8>} : () -> tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_2:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<4x!quant.uniform>, value = dense<5> : tensor<1x1x1x4xi32>} : () -> tensor<4x!quant.uniform> +// CHECK: %[[PAD:.+]] = "tfl.pad"(%[[ARG_0]], %[[CONST_0]]) : (tensor<1x5x5x2x!quant.uniform>, tensor<4x2xi32>) -> tensor<1x7x7x2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[PAD]], %[[QCONST_1]], %[[QCONST_2]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x7x7x2x!quant.uniform>, tensor<4x2x2x2x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00,3.000000e+00,3.000000e+00}>>, tensor<4x!quant.uniform>) -> tensor<1x6x6x4x!quant.uniform> +// CHECK: return %[[CONV_2D]] + +func.func @conv_with_bias_same_padding_srq(%arg0: tensor<1x32x32x3x!quant.uniform>) -> (tensor<1x32x32x2x!quant.uniform>) { + %0 = stablehlo.constant() {value = dense<3> : tensor<3x3x3x2xi8>} : () -> tensor<3x3x3x2x!quant.uniform> + %1 = stablehlo.constant() {value = dense<5> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform> + %2 = stablehlo.broadcast_in_dim %1, dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> + %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x32x32x3x!quant.uniform>, tensor<3x3x3x2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> + %4 = stablehlo.add %3, %2 : tensor<1x32x32x2x!quant.uniform> + %5 = stablehlo.uniform_quantize %4 : 
(tensor<1x32x32x2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> + return %5 : tensor<1x32x32x2x!quant.uniform> +} +// CHECK-LABEL: func.func @conv_with_bias_same_padding_srq +// CHECK-SAME: (%[[ARG_0:.+]]: tensor<1x32x32x3x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK-DAG: %[[QCONST_0:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, value = dense<3> : tensor<2x3x3x3xi8>} : () -> tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>> +// CHECK-DAG: %[[QCONST_1:.+]] = "tfl.pseudo_qconst"() {qtype = tensor<2x!quant.uniform>, value = dense<5> : tensor<1x1x1x2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[CONV_2D:.+]] = "tfl.conv_2d"(%[[ARG_0]], %[[QCONST_0]], %[[QCONST_1]]) {dilation_h_factor = 1 : i32, dilation_w_factor = 1 : i32, fused_activation_function = "NONE", padding = "SAME", stride_h = 1 : i32, stride_w = 1 : i32} : (tensor<1x32x32x3x!quant.uniform>, tensor<2x3x3x3x!quant.uniform:f32:0, {3.000000e+00,3.000000e+00}>>, tensor<2x!quant.uniform>) -> tensor<1x32x32x2x!quant.uniform> +// CHECK: return %[[CONV_2D]] -// CHECK-NOT: stablehlo.dot_general -// CHECK: %[[QCONST_0:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<3x1024x!quant.uniform>, value = dense<1> : tensor<3x1024xi8>} : () -> tensor<3x1024x!quant.uniform> -// CHECK: %[[QCONST_1:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<3x!quant.uniform:f32, 2.000000e+00>>, value = dense<0> : tensor<3xi32>} : () -> tensor<3x!quant.uniform:f32, 2.000000e+00>> -// CHECK: "tfl.fully_connected"(%[[ARG_1]], %[[QCONST_0]], %[[QCONST_1]]) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x1024x!quant.uniform>, tensor<3x1024x!quant.uniform>, tensor<3x!quant.uniform:f32, 2.000000e+00>>) -> tensor<1x3x!quant.uniform> -// CHECK-NOT: tfl.batch_matmul +// ----- + +// Tests that a quantized stablehlo.transpose is converted to tfl.transpose. 
+ +func.func @transpose( + %arg0: tensor<2x3x4x!quant.uniform> + ) -> tensor<4x3x2x!quant.uniform> { + %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x3x4x!quant.uniform>) -> tensor<4x3x2x!quant.uniform> + return %0 : tensor<4x3x2x!quant.uniform> +} +// CHECK-LABEL: transpose +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x4x!quant.uniform> +// CHECK-NOT: stablehlo.transpose +// CHECK: %[[CST:.+]] = arith.constant dense<[2, 1, 0]> : tensor<3xi32> +// CHECK: %[[TRANSPOSE:.+]] = "tfl.transpose"(%[[ARG0]], %[[CST]]) : (tensor<2x3x4x!quant.uniform>, tensor<3xi32>) -> tensor<4x3x2x!quant.uniform> +// CHECK: return %[[TRANSPOSE]] + +// ----- + +// Tests that a float stablehlo.transpose is not converted to tfl.transpose. + +func.func @float_transpose(%arg0: tensor<2x3x4xf32>) -> tensor<4x3x2xf32> { + %0 = stablehlo.transpose %arg0, dims = [2, 1, 0] : (tensor<2x3x4xf32>) -> tensor<4x3x2xf32> + return %0 : tensor<4x3x2xf32> +} +// CHECK-LABEL: float_transpose +// CHECK-NOT: tfl.transpose +// CHECK: stablehlo.transpose + +// ----- + +// Tests that a quantized stablehlo.reshape is converted to tfl.reshape. + +func.func @reshape( + %arg0: tensor<2x3x4x!quant.uniform> + ) -> tensor<6x4x!quant.uniform> { + %0 = stablehlo.reshape %arg0 : (tensor<2x3x4x!quant.uniform>) -> tensor<6x4x!quant.uniform> + return %0 : tensor<6x4x!quant.uniform> +} +// CHECK-LABEL: reshape +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x4x!quant.uniform> +// CHECK-NOT: stablehlo.reshape +// CHECK: %[[CST:.+]] = arith.constant dense<[6, 4]> : tensor<2xi32> +// CHECK: %[[RESHAPE:.+]] = "tfl.reshape"(%[[ARG0]], %[[CST]]) : (tensor<2x3x4x!quant.uniform>, tensor<2xi32>) -> tensor<6x4x!quant.uniform> +// CHECK: return %[[RESHAPE]] + +// ----- + +// Tests that a float stablehlo.reshape is not converted to tfl.reshape. 
+ +func.func @float_reshape(%arg0: tensor<2x3x4xf32>) -> tensor<6x4xf32> { + %0 = stablehlo.reshape %arg0 : (tensor<2x3x4xf32>) -> tensor<6x4xf32> + return %0 : tensor<6x4xf32> +} +// CHECK-LABEL: float_reshape +// CHECK-NOT: tfl.reshape +// CHECK: stablehlo.reshape + +// ----- + +// Tests that a quantized stablehlo.select is converted to tfl.select_v2. + +func.func @select( + %arg0: tensor<1x3xi1>, + %arg1: tensor<1x3x!quant.uniform>, + %arg2: tensor<1x3x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> { + %0 = "stablehlo.select"(%arg0, %arg1, %arg2) : ( + tensor<1x3xi1>, + tensor<1x3x!quant.uniform>, + tensor<1x3x!quant.uniform> + ) -> tensor<1x3x!quant.uniform> + return %0 : tensor<1x3x!quant.uniform> +} +// CHECK-LABEL: select +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x3xi1>, %[[ARG1:.+]]: tensor<1x3x!quant.uniform>, %[[ARG2:.+]]: tensor<1x3x!quant.uniform> +// CHECK-NOT: stablehlo.select +// CHECK: %[[SELECT:.+]] = "tfl.select_v2"(%[[ARG0]], %[[ARG1]], %[[ARG2]]) : (tensor<1x3xi1>, tensor<1x3x!quant.uniform>, tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK: return %[[SELECT]] + +// ----- + +// Tests that a float stablehlo.select is not converted to tfl.select_v2. + +func.func @float_select(%arg0: tensor<1x3xi1>, %arg1: tensor<1x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> { + %0 = "stablehlo.select"(%arg0, %arg1, %arg2) : (tensor<1x3xi1>, tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} +// CHECK-LABEL: float_select +// CHECK-NOT: tfl.select_v2 +// CHECK: stablehlo.select + +// ----- + +// Tests that a quantized stablehlo.concatenate is converted to tfl.concatenation. 
+ +func.func @concatenate( + %arg0: tensor<3x2x!quant.uniform>, + %arg1: tensor<1x2x!quant.uniform> + ) -> tensor<4x2x!quant.uniform> { + %0 = "stablehlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : ( + tensor<3x2x!quant.uniform>, + tensor<1x2x!quant.uniform> + ) -> tensor<4x2x!quant.uniform> + return %0 : tensor<4x2x!quant.uniform> +} +// CHECK-LABEL: concatenate +// CHECK-SAME: %[[ARG0:.+]]: tensor<3x2x!quant.uniform>, %[[ARG1:.+]]: tensor<1x2x!quant.uniform> +// CHECK-NOT: stablehlo.concatenate +// CHECK: %[[CONCAT:.+]] = "tfl.concatenation"(%arg0, %arg1) {axis = 0 : i32, fused_activation_function = "NONE"} : (tensor<3x2x!quant.uniform>, tensor<1x2x!quant.uniform>) -> tensor<4x2x!quant.uniform> +// CHECK: return %[[CONCAT]] + +// ----- + +// Tests that a float stablehlo.concatenate is not converted to tfl.concatenation. + +func.func @float_concatenate(%arg0: tensor<3x2xf32>, %arg1: tensor<1x2xf32>) -> tensor<4x2xf32> { + %0 = "stablehlo.concatenate"(%arg0, %arg1) {dimension = 0 : i64} : (tensor<3x2xf32>, tensor<1x2xf32>) -> tensor<4x2xf32> + return %0 : tensor<4x2xf32> +} +// CHECK-LABEL: float_concatenate +// CHECK-NOT: tfl.concatenation +// CHECK: stablehlo.concatenate + +// ----- + +// Tests that a quantized stablehlo.pad without interior padding is converted to +// tfl.padv2. 
+ +func.func @pad_without_interior_padding( + %arg0: tensor<2x3x!quant.uniform>, + %arg1: tensor> + ) -> tensor<4x5x!quant.uniform> { + %0 = stablehlo.pad %arg0, %arg1, low = [0, 1], high = [2, 1], interior = [0, 0] : ( + tensor<2x3x!quant.uniform>, + tensor> + ) -> tensor<4x5x!quant.uniform> + return %0 : tensor<4x5x!quant.uniform> +} +// CHECK-LABEL: pad_without_interior_padding +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x!quant.uniform> +// CHECK-SAME: %[[ARG1:.+]]: tensor> +// CHECK: %[[PADDING:.+]] = arith.constant +// CHECK{LITERAL}: dense<[[0, 2], [1, 1]]> : tensor<2x2xi32> +// CHECK: %[[PAD:.+]] = "tfl.padv2"(%[[ARG0]], %[[PADDING]], %[[ARG1]]) : (tensor<2x3x!quant.uniform>, tensor<2x2xi32>, tensor>) -> tensor<4x5x!quant.uniform> +// CHECK: return %[[PAD]] + +// ----- + +// Tests that a quantized stablehlo.pad with interior padding is converted to +// tfl.dilate and tfl.padv2. + +func.func @pad_with_interior_padding( + %arg0: tensor<2x3x!quant.uniform>, + %arg1: tensor> + ) -> tensor<5x9x!quant.uniform> { + %0 = stablehlo.pad %arg0, %arg1, low = [0, 1], high = [2, 1], interior = [1, 2] : ( + tensor<2x3x!quant.uniform>, + tensor> + ) -> tensor<5x9x!quant.uniform> + return %0 : tensor<5x9x!quant.uniform> +} +// CHECK-LABEL: pad_with_interior_padding +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x!quant.uniform> +// CHECK-SAME: %[[ARG1:.+]]: tensor> +// CHECK: %[[PADDING:.+]] = arith.constant +// CHECK{LITERAL}: dense<[[0, 2], [1, 1]]> : tensor<2x2xi32> +// CHECK: %[[INTERIOR:.+]] = arith.constant +// CHECK{LITERAL}: dense<[1, 2]> : tensor<2xi32> +// CHECK: %[[DILATE:.+]] = "tfl.dilate"(%[[ARG0]], %[[INTERIOR]], %[[ARG1]]) : (tensor<2x3x!quant.uniform>, tensor<2xi32>, tensor>) -> tensor<3x7x!quant.uniform> +// CHECK: %[[PAD:.+]] = "tfl.padv2"(%[[DILATE]], %[[PADDING]], %[[ARG1]]) : (tensor<3x7x!quant.uniform>, tensor<2x2xi32>, tensor>) -> tensor<5x9x!quant.uniform> +// CHECK: return %[[PAD]] + +// ----- + +// Tests that a float stablehlo.pad is not converted to 
tfl.padv2. + +func.func @float_pad(%arg0: tensor<2x3xf32>, %arg1: tensor) -> tensor<4x5xf32> { + %0 = stablehlo.pad %arg0, %arg1, low = [0, 1], high = [2, 1], interior = [0, 0] : (tensor<2x3xf32>, tensor) -> tensor<4x5xf32> + return %0 : tensor<4x5xf32> +} +// CHECK-LABEL: float_pad +// CHECK-NOT: tfl.padv2 +// CHECK: stablehlo.pad + +// ----- + +// Tests that a quantized stablehlo.slice is converted to tfl.slice when stride +// is 1. + +func.func @slice( + %arg0: tensor<3x4x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + %0 = "stablehlo.slice"(%arg0) { + start_indices = array, + limit_indices = array, + strides = array + } : ( + tensor<3x4x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} +// CHECK-LABEL: slice +// CHECK-SAME: %[[ARG0:.+]]: tensor<3x4x!quant.uniform> +// CHECK-DAG: %[[START:.+]] = arith.constant dense<{{\[1, 2\]}}> : tensor<2xi32> +// CHECK-DAG: %[[SIZE:.+]] = arith.constant dense<2> : tensor<2xi32> +// CHECK: %[[SLICE:.+]] = "tfl.slice"(%[[ARG0]], %[[START]], %[[SIZE]]) : (tensor<3x4x!quant.uniform>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2x!quant.uniform> +// CHECK: return %[[SLICE]] + +// ----- + +// Tests that a quantized stablehlo.slice is converted to tfl.strided_slice when +// stride is not 1. 
+ +func.func @strided_slice( + %arg0: tensor<3x6x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> { + %0 = "stablehlo.slice"(%arg0) { + start_indices = array, + limit_indices = array, + strides = array + } : ( + tensor<3x6x!quant.uniform> + ) -> tensor<2x2x!quant.uniform> + return %0 : tensor<2x2x!quant.uniform> +} +// CHECK-LABEL: strided_slice +// CHECK-SAME: %[[ARG0:.+]]: tensor<3x6x!quant.uniform> +// CHECK: %[[START:.+]] = arith.constant +// CHECK{LITERAL}: dense<[0, 2]> : tensor<2xi32> +// CHECK: %[[SIZE:.+]] = arith.constant +// CHECK{LITERAL}: dense<[3, 4]> : tensor<2xi32> +// CHECK: %[[STRIDE:.+]] = arith.constant +// CHECK{LITERAL}: dense<[2, 3]> : tensor<2xi32> +// CHECK: %[[SLICE:.+]] = "tfl.strided_slice"(%[[ARG0]], %[[START]], %[[SIZE]], %[[STRIDE]]) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<3x6x!quant.uniform>, tensor<2xi32>, tensor<2xi32>, tensor<2xi32>) -> tensor<2x2x!quant.uniform> +// CHECK: return %[[SLICE]] + +// ----- + +// Tests that a float stablehlo.slice is not converted to tfl.slice. + +func.func @float_slice(%arg0: tensor<3x4xf32>) -> tensor<2x2xf32> { + %0 = "stablehlo.slice"(%arg0) { + start_indices = array, + limit_indices = array, + strides = array + } : (tensor<3x4xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} +// CHECK-LABEL: float_slice +// CHECK-NOT: tfl.slice +// CHECK-NOT: tfl.strided_slice +// CHECK: stablehlo.slice + +// ----- + +// Tests that a quantized stablehlo.broadcast_in_dim is converted to +// tfl.broadcast_to. 
+ +func.func @broadcast_in_dim( + %arg0: tensor<1x2x!quant.uniform> + ) -> tensor<3x2x!quant.uniform> { + %0 = "stablehlo.broadcast_in_dim"(%arg0) { + broadcast_dimensions = array + } : (tensor<1x2x!quant.uniform>) -> tensor<3x2x!quant.uniform> + return %0 : tensor<3x2x!quant.uniform> +} +// CHECK-LABEL: broadcast_in_dim +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2x!quant.uniform> +// CHECK: %[[SHAPE:.+]] = arith.constant +// CHECK{LITERAL}: dense<[3, 2]> : tensor<2xi32> +// CHECK: %[[BROADCAST:.+]] = "tfl.broadcast_to"(%[[ARG0]], %[[SHAPE]]) : (tensor<1x2x!quant.uniform>, tensor<2xi32>) -> tensor<3x2x!quant.uniform> +// CHECK: return %[[BROADCAST]] + +// ----- + +// Tests that a quantized stablehlo.broadcast_in_dim is converted to +// tfl.transpose and tfl.broadcast_to when broadcast_dimensions is not in +// ascending order. + +func.func @broadcast_in_dim_with_transpose( + %arg0: tensor<1x2x!quant.uniform> + ) -> tensor<2x3x!quant.uniform> { + %0 = "stablehlo.broadcast_in_dim"(%arg0) { + broadcast_dimensions = array + } : (tensor<1x2x!quant.uniform>) -> tensor<2x3x!quant.uniform> + return %0 : tensor<2x3x!quant.uniform> +} +// CHECK-LABEL: broadcast_in_dim_with_transpose +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2x!quant.uniform> +// CHECK: %[[BROADCAST_DIM:.+]] = arith.constant +// CHECK{LITERAL}: dense<[2, 3]> : tensor<2xi32> +// CHECK: %[[PERM:.+]] = arith.constant +// CHECK{LITERAL}: dense<[1, 0]> : tensor<2xi32> +// CHECK: %[[TRANSPOSE:.+]] = "tfl.transpose"(%[[ARG0]], %[[PERM]]) : (tensor<1x2x!quant.uniform>, tensor<2xi32>) -> tensor<2x1x!quant.uniform> +// CHECK: %[[BROADCAST:.+]] = "tfl.broadcast_to"(%[[TRANSPOSE]], %[[BROADCAST_DIM]]) : (tensor<2x1x!quant.uniform>, tensor<2xi32>) -> tensor<2x3x!quant.uniform> +// CHECK: return %[[BROADCAST]] + +// ----- + +// Tests that a quantized stablehlo.broadcast_in_dim is converted to +// tfl.expand_dims and tfl.broadcast_to when input rank is smaller than output +// rank. 
+ +func.func @broadcast_in_dim_with_expand( + %arg0: tensor<1x2x!quant.uniform> + ) -> tensor<3x2x1x1x!quant.uniform> { + %0 = "stablehlo.broadcast_in_dim"(%arg0) { + broadcast_dimensions = array + } : (tensor<1x2x!quant.uniform>) -> tensor<3x2x1x1x!quant.uniform> + return %0 : tensor<3x2x1x1x!quant.uniform> +} +// CHECK-LABEL: broadcast_in_dim_with_expand +// CHECK-SAME: %[[ARG0:.+]]: tensor<1x2x!quant.uniform> +// CHECK-DAG: %[[BROADCAST_DIM:.+]] = arith.constant dense<{{\[3, 2, 1, 1\]}}> : tensor<4xi32> +// CHECK-DAG: %[[EXPAND_DIM1:.+]] = arith.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[EXPAND_DIM0:.+]] = arith.constant dense<2> : tensor<1xi32> +// CHECK: %[[EXPAND0:.+]] = "tfl.expand_dims"(%[[ARG0]], %[[EXPAND_DIM0]]) : (tensor<1x2x!quant.uniform>, tensor<1xi32>) -> tensor<1x2x1x!quant.uniform> +// CHECK: %[[EXPAND1:.+]] = "tfl.expand_dims"(%[[EXPAND0]], %[[EXPAND_DIM1]]) : (tensor<1x2x1x!quant.uniform>, tensor<1xi32>) -> tensor<1x2x1x1x!quant.uniform> +// CHECK: %[[BROADCAST:.+]] = "tfl.broadcast_to"(%[[EXPAND1]], %[[BROADCAST_DIM]]) : (tensor<1x2x1x1x!quant.uniform>, tensor<4xi32>) -> tensor<3x2x1x1x!quant.uniform> +// CHECK: return %[[BROADCAST]] + +// ----- + +// Tests that a quantized stablehlo.broadcast_in_dim is converted to +// tfl.transpose, tfl.expand_dims and tfl.broadcast_to when broadcast_dimensions +// is not in ascending order and input rank is smaller than output rank. 
+ +func.func @broadcast_in_dim_with_transpose_and_expand( + %arg0: tensor<2x3x4x!quant.uniform> + ) -> tensor<3x2x1x1x4x!quant.uniform> { + %0 = "stablehlo.broadcast_in_dim"(%arg0) { + broadcast_dimensions = array + } : (tensor<2x3x4x!quant.uniform>) -> tensor<3x2x1x1x4x!quant.uniform> + return %0 : tensor<3x2x1x1x4x!quant.uniform> +} +// CHECK-LABEL: broadcast_in_dim_with_transpose_and_expand +// CHECK-SAME: %[[ARG0:.+]]: tensor<2x3x4x!quant.uniform> +// CHECK-DAG: %[[BROADCAST_DIM:.+]] = arith.constant dense<{{\[3, 2, 1, 1, 4\]}}> : tensor<5xi32> +// CHECK-DAG: %[[EXPAND_DIM1:.+]] = arith.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[EXPAND_DIM0:.+]] = arith.constant dense<2> : tensor<1xi32> +// CHECK-DAG: %[[PERM:.+]] = arith.constant dense<{{\[1, 0, 2\]}}> : tensor<3xi32> +// CHECK: %[[TRANSPOSE:.+]] = "tfl.transpose"(%[[ARG0]], %[[PERM]]) : (tensor<2x3x4x!quant.uniform>, tensor<3xi32>) -> tensor<3x2x4x!quant.uniform> +// CHECK: %[[EXPAND0:.+]] = "tfl.expand_dims"(%[[TRANSPOSE]], %[[EXPAND_DIM0]]) : (tensor<3x2x4x!quant.uniform>, tensor<1xi32>) -> tensor<3x2x1x4x!quant.uniform> +// CHECK: %[[EXPAND1:.+]] = "tfl.expand_dims"(%[[EXPAND0]], %[[EXPAND_DIM1]]) : (tensor<3x2x1x4x!quant.uniform>, tensor<1xi32>) -> tensor<3x2x1x1x4x!quant.uniform> +// CHECK: %[[BROADCAST:.+]] = "tfl.broadcast_to"(%[[EXPAND1]], %[[BROADCAST_DIM]]) : (tensor<3x2x1x1x4x!quant.uniform>, tensor<5xi32>) -> tensor<3x2x1x1x4x!quant.uniform> +// CHECK: return %[[BROADCAST]] + +// ----- + +// Tests that a float stablehlo.broadcast_in_dim is not converted to tfl.broadcast_to. 
+ +func.func @float_broadcast_in_dim(%arg0: tensor<1x2xf32>) -> tensor<3x2xf32> { + %0 = "stablehlo.broadcast_in_dim"(%arg0) { + broadcast_dimensions = array + } : (tensor<1x2xf32>) -> tensor<3x2xf32> + return %0 : tensor<3x2xf32> +} +// CHECK-LABEL: float_broadcast_in_dim +// CHECK-NOT: tfl.broadcast_to +// CHECK-NOT: tfl.transpose +// CHECK-NOT: tfl.expand_dims +// CHECK: stablehlo.broadcast_in_dim + +// ----- + +// Test that a quantized stablehlo.reduce_window with max is converted to +// tfl.max_pool_2d. + +func.func @reduce_window_with_max( + %arg0: tensor<2x9x10x3x!quant.uniform>, + %arg1: tensor> +) -> tensor<2x4x3x3x!quant.uniform> { + %0 = "stablehlo.reduce_window"(%arg0, %arg1) ({ + ^bb0(%arg2: tensor>, %arg3: tensor>): + %1 = stablehlo.maximum %arg2, %arg3 : tensor> + stablehlo.return %1 : tensor> + }) {window_dimensions = array, window_strides = array} : (tensor<2x9x10x3x!quant.uniform>, tensor>) -> tensor<2x4x3x3x!quant.uniform> + return %0 : tensor<2x4x3x3x!quant.uniform> +} + +// CHECK-LABEL: reduce_window_with_max +// CHECK-SAME: %[[ARG0:.*]]: tensor<2x9x10x3x!quant.uniform> +// CHECK-SAME: %[[ARG1:.*]]: tensor> +// CHECK: %[[MAX_POOL:.*]] = "tfl.max_pool_2d"(%[[ARG0]]) +// CHECK-SAME: {filter_height = 3 : i32, filter_width = 4 : i32, fused_activation_function = "NONE", padding = "VALID", stride_h = 2 : i32, stride_w = 3 : i32} +// CHECK-SAME: (tensor<2x9x10x3x!quant.uniform>) -> tensor<2x4x3x3x!quant.uniform> +// CHECK: return %[[MAX_POOL]] + +// ----- + +// Test that a quantized stablehlo.reduce_window with max whose rank is not 4 +// is not converted to tfl.max_pool_2d. 
+ +func.func @reduce_window_not_4d( + %arg0: tensor<3x2x9x10x3x!quant.uniform>, + %arg1: tensor> +) -> tensor<3x2x4x3x3x!quant.uniform> { + %0 = "stablehlo.reduce_window"(%arg0, %arg1) ({ + ^bb0(%arg2: tensor>, %arg3: tensor>): + %1 = stablehlo.maximum %arg2, %arg3 : tensor> + stablehlo.return %1 : tensor> + }) {window_dimensions = array, window_strides = array} : (tensor<3x2x9x10x3x!quant.uniform>, tensor>) -> tensor<3x2x4x3x3x!quant.uniform> + return %0 : tensor<3x2x4x3x3x!quant.uniform> +} + +// CHECK-LABEL: reduce_window_not_4d +// CHECK: stablehlo.reduce_window +// CHECK-NOT: tfl.max_pool_2d + +// ----- + +// Test that a quantized stablehlo.reduce_window with max that takes multiple +// inputs is not converted to tfl.max_pool_2d. + +func.func @reduce_window_not_binary( + %arg0: tensor<3x2x9x10x3x!quant.uniform>, + %arg1: tensor<3x2x9x10x3x!quant.uniform>, + %arg2: tensor>, + %arg3: tensor> +) -> tensor<3x2x4x3x3x!quant.uniform> { + %0, %1 = "stablehlo.reduce_window"(%arg0, %arg1, %arg2, %arg3) ({ + ^bb0(%arg4: tensor>, %arg5: tensor>, %arg6: tensor>, %arg7: tensor>): + %2 = stablehlo.maximum %arg4, %arg5 : tensor> + %3 = stablehlo.maximum %arg6, %arg7 : tensor> + stablehlo.return %2, %3 : tensor>, tensor> + }) {window_dimensions = array, window_strides = array} : (tensor<3x2x9x10x3x!quant.uniform>, tensor<3x2x9x10x3x!quant.uniform>, tensor>, tensor>) -> (tensor<3x2x4x3x3x!quant.uniform>, tensor<3x2x4x3x3x!quant.uniform>) + return %0 : tensor<3x2x4x3x3x!quant.uniform> +} + +// CHECK-LABEL: reduce_window_not_binary +// CHECK: stablehlo.reduce_window +// CHECK-NOT: tfl.max_pool_2d + +// ----- + +// Test that a float stablehlo.reduce_window with max is not converted to +// tfl.max_pool_2d. 
+ +func.func @float_reduce_window_with_max( + %arg0: tensor<2x9x10x3xf32>, + %arg1: tensor +) -> tensor<2x4x3x3xf32> { + %0 = "stablehlo.reduce_window"(%arg0, %arg1) ({ + ^bb0(%arg2: tensor, %arg3: tensor): + %1 = stablehlo.maximum %arg2, %arg3 : tensor + stablehlo.return %1 : tensor + }) {window_dimensions = array, window_strides = array} : (tensor<2x9x10x3xf32>, tensor) -> tensor<2x4x3x3xf32> + return %0 : tensor<2x4x3x3xf32> +} + +// CHECK-LABEL: float_reduce_window_with_max +// CHECK: stablehlo.reduce_window +// CHECK-NOT: tfl.max_pool_2d diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc index 4ddfdb0d33ff05..15481b9a0a1ad2 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/compose_uniform_quantized_type_pass.cc @@ -38,7 +38,7 @@ limitations under the License. 
#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #define DEBUG_TYPE "stablehlo-compose-uniform-quantized-type" diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc index 26d45005c835a8..000f88639240f3 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo.cc @@ -482,10 +482,15 @@ class Convert1DConvOp : public OpConversionPattern { const int64_t input_channels = conv_op.getLhs().getType().cast().getDimSize( input_feature_dimension); + const int kernel_input_feature_dimension = + dnums.getKernelInputFeatureDimension(); + const int kernel_input_channels = + conv_op.getRhs().getType().cast().getDimSize( + kernel_input_feature_dimension); const int64_t feature_group_count = conv_op.getFeatureGroupCount(); - if (feature_group_count != 1 && feature_group_count != input_channels) - return rewriter.notifyMatchFailure(conv_op, - "Group convolution is not supported,"); + if (feature_group_count != input_channels / kernel_input_channels || + input_channels % kernel_input_channels != 0) + return failure(); // // Transpose and reshape the input and kernel @@ -498,6 +503,7 @@ class Convert1DConvOp : public OpConversionPattern { image_2d_shape.push_back(1); auto image_2d_type = RankedTensorType::get(image_2d_shape, image_type.getElementType()); + auto loc = conv_op.getLoc(); auto image_2d_op = rewriter.create( conv_op.getLoc(), image_2d_type, conv_op.getLhs()); @@ -509,8 +515,8 @@ class Convert1DConvOp : public OpConversionPattern { auto image_permutation_and_shape 
= GetPermutationAndTransposedShape( image_permutation, image_2d_type, rewriter); auto transposed_image_2d_op = rewriter.create( - conv_op.getLoc(), image_permutation_and_shape.shape, - image_2d_op->getResult(0), image_permutation_and_shape.permutation); + loc, image_permutation_and_shape.shape, image_2d_op->getResult(0), + image_permutation_and_shape.permutation); // Reshape kernel to add a new spatial dimension. auto kernel_type = conv_op.getRhs().getType().cast(); @@ -521,8 +527,8 @@ class Convert1DConvOp : public OpConversionPattern { kernel_2d_shape.push_back(1); auto kernel_2d_type = RankedTensorType::get(kernel_2d_shape, kernel_type.getElementType()); - auto kernel_2d_op = rewriter.create( - conv_op.getLoc(), kernel_2d_type, conv_op.getRhs()); + auto kernel_2d_op = + rewriter.create(loc, kernel_2d_type, conv_op.getRhs()); // Transpose kernel to get it into WHIO form (where H is the added dim). SmallVector kernel_permutation = { @@ -533,8 +539,8 @@ class Convert1DConvOp : public OpConversionPattern { auto kernel_permutation_and_shape = GetPermutationAndTransposedShape( kernel_permutation, kernel_2d_type, rewriter); auto transposed_kernel_2d_op = rewriter.create( - conv_op.getLoc(), kernel_permutation_and_shape.shape, - kernel_2d_op->getResult(0), kernel_permutation_and_shape.permutation); + loc, kernel_permutation_and_shape.shape, kernel_2d_op->getResult(0), + kernel_permutation_and_shape.permutation); // // Create 2d equivalents for 1d convolution attributes. 
@@ -624,11 +630,11 @@ class Convert1DConvOp : public OpConversionPattern { .shape; auto conv2d_op = rewriter.create( - conv_op.getLoc(), transposed_output_2d_shape, - transposed_image_2d_op.getResult(), transposed_kernel_2d_op.getResult(), - window_strides_2d, padding_2d, lhs_dilation_2d, rhs_dilation_2d, - window_reversal_2d, dnums_2d, conv_op.getFeatureGroupCount(), - conv_op.getBatchGroupCount(), conv_op.getPrecisionConfigAttr()); + loc, transposed_output_2d_shape, transposed_image_2d_op.getResult(), + transposed_kernel_2d_op.getResult(), window_strides_2d, padding_2d, + lhs_dilation_2d, rhs_dilation_2d, window_reversal_2d, dnums_2d, + conv_op.getFeatureGroupCount(), conv_op.getBatchGroupCount(), + conv_op.getPrecisionConfigAttr()); OpResult conv2d_output = conv2d_op->getResult(0); auto conv2d_output_type = conv2d_output.getType().cast(); @@ -642,7 +648,7 @@ class Convert1DConvOp : public OpConversionPattern { auto output_permutation_and_shape = GetInversePermutationAndShape( output_permutation, conv2d_output_type, rewriter); auto transposed_output_2d_op = rewriter.create( - conv_op.getLoc(), output_permutation_and_shape.shape, conv2d_output, + loc, output_permutation_and_shape.shape, conv2d_output, output_permutation_and_shape.permutation); // Drop the trailing spatial dimension from the output. @@ -734,13 +740,13 @@ class ConvertNonTrivialConvOp // Mirror the filter in the spatial dimensions. 
mlir::Value reverse_filter_in = conv_op.getRhs(); - // If the kernel is with format [0,1,i,o] we transpose it to [0,1,o,i] - // as the TF->TFL pass anticipates this and the kernel format information - // will be lost once we legalize to TF - if (isKernelFormatHWIO(dnums)) { + // If the kernel is with format anythoing other than HWOI, we + // transpose it to [0,1,o,i] as the TF->TFL pass anticipates this and the + // kernel format information will be lost once we legalize to TF + if (!isKernelFormatHWOI(dnums)) { SmallVector permutation; - for (int64_t dim : dnums.getInputSpatialDimensions()) { - permutation.push_back(dim - 1); + for (int64_t dim : dnums.getKernelSpatialDimensions()) { + permutation.push_back(dim); } permutation.push_back(dnums.getKernelOutputFeatureDimension()); permutation.push_back(dnums.getKernelInputFeatureDimension()); @@ -753,10 +759,11 @@ class ConvertNonTrivialConvOp permutation)); reverse_filter_in = filter_transposed; } - mhlo::ReverseOp filter; - filter = rewriter.create( - conv_op.getLoc(), reverse_filter_in, - rewriter.getI64TensorAttr(dnums.getKernelSpatialDimensions())); + + // Lets hard-code the reverse indexes to be {0, 1} as the expectation is + // that the kernel is always in HWOI format, with the above code. 
+ mhlo::ReverseOp filter = rewriter.create( + conv_op.getLoc(), reverse_filter_in, rewriter.getI64TensorAttr({0, 1})); // if output is not in [b, 0, 1, f] format, insert transpose to go back if (dnums.getOutputBatchDimension() != 0 || @@ -914,12 +921,6 @@ class ConvertNonTrivialConvOp return true; } - bool isKernelFormatHWIO(mhlo::ConvDimensionNumbersAttr dnums) const { - int64_t num_spatial_dims = dnums.getKernelSpatialDimensions().size(); - return dnums.getKernelInputFeatureDimension() == num_spatial_dims && - dnums.getKernelOutputFeatureDimension() == num_spatial_dims + 1; - } - bool isKernelFormatHWOI(mhlo::ConvDimensionNumbersAttr dnums) const { int64_t num_spatial_dims = dnums.getKernelSpatialDimensions().size(); return dnums.getKernelInputFeatureDimension() == num_spatial_dims + 1 && @@ -969,17 +970,6 @@ class ConvertNonTrivialConvOp "doesn't support negative pads"); } - // Checks kernel dimensions. - if (!isKernelFormatHWIO(dnums) && !isKernelFormatHWOI(dnums)) - return rewriter.notifyMatchFailure( - conv_op, "requires kernel format [0, 1, o, i] or [0, 1, i, o]"); - auto kernel_spatial_dimensions = dnums.getKernelSpatialDimensions(); - for (auto p : llvm::enumerate(kernel_spatial_dimensions)) { - if (p.value() != p.index()) - return rewriter.notifyMatchFailure( - conv_op, "requires kernel format [0, 1, o, i] or [0, 1, i, o]"); - } - return success(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td index 1607343ab15879..fe988ba9b20265 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_hlo_patterns.td @@ -23,7 +23,7 @@ include "mhlo/IR/hlo_ops.td" // Check if broadcasting is compatible with TF ops. 
def IsLegalNumpyRankedBroadcast : - Constraint, + Constraint{})">, "broadcasting should be compatible with TF ops">; // Return a constant op that carries the shape of the given value. diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc index 6e7ebdec6058dc..5f04704d54ef78 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.cc @@ -20,19 +20,26 @@ limitations under the License. #include #include +#include "absl/strings/str_cat.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/Serialization.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo @@ 
-48,12 +55,11 @@ static constexpr char kShardingAttr[] = "mhlo.sharding"; static constexpr char kShardingName[] = "Sharding"; class RemoveCustomCallWithSharding - : public mlir::OpRewritePattern { - using OpRewritePattern::OpRewritePattern; + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; - mlir::LogicalResult matchAndRewrite( - mlir::stablehlo::CustomCallOp op, - PatternRewriter &rewriter) const override { + LogicalResult matchAndRewrite(stablehlo::CustomCallOp op, + PatternRewriter &rewriter) const override { // Removes the custom call with sharding op if the operand type is the // same as the result type. if (op->hasAttr(kShardingAttr) && op.getCallTargetName() == kShardingName && @@ -61,15 +67,15 @@ class RemoveCustomCallWithSharding op.getOperands().front().getType() == op.getResults().front().getType()) { rewriter.replaceOp(op, op.getOperands()); - return mlir::success(); + return success(); } - return mlir::failure(); + return failure(); } }; namespace { -bool IsShloMainFuncOp(mlir::func::FuncOp func_op) { +bool IsShloMainFuncOp(func::FuncOp func_op) { if (func_op == nullptr) { return false; } @@ -86,32 +92,55 @@ bool IsShloMainFuncOp(mlir::func::FuncOp func_op) { return true; } +// Returns true if XlaCallModuleOp has the "platform index argument". The +// platform index argument is an extra 0-dimensional i32 tensor argument at +// index 0 when the XlaCallModuleOp contains more than one platform specified at +// the "platform" attribute. +// +// See: +// https://github.com/tensorflow/tensorflow/blob/eba24f41ba9d661d2f58a515921720cf90708cd4/tensorflow/compiler/tf2xla/ops/xla_ops.cc#L1376-L1385 +bool ContainsPlatformIndexArg(TF::XlaCallModuleOp xla_call_module_op) { + return xla_call_module_op.getPlatforms().size() > 1; +} + +// Removes the platform index argument from the function. It is equivalent to +// removing the first argument from `func_op` (see the comments at +// `ContainsPlatformIndexArg`). 
This function assumes that `func_op` is a valid +// function deserialized from XlaCallModule op. +void RemovePlatformIndexArg(MLIRContext *ctx, func::FuncOp func_op) { + // If there are multiple platforms, the first argument is reserved for + // passing the platform index. + FunctionType function_type = func_op.getFunctionType(); + ArrayRef new_input_types = + function_type.getInputs().take_back(func_op.getNumArguments() - 1); + func_op.setFunctionType( + FunctionType::get(ctx, new_input_types, function_type.getResults())); + func_op.getBody().eraseArgument(0); +} + } // namespace -class ConvertTFXlaCallModuleOp - : public mlir::OpRewritePattern { +class ConvertTFXlaCallModuleOp : public OpRewritePattern { public: explicit ConvertTFXlaCallModuleOp(MLIRContext *context, ModuleOp module_op) - : OpRewritePattern(context), - module_op_(module_op) {} - using OpRewritePattern::OpRewritePattern; + : OpRewritePattern(context), module_op_(module_op) {} + using OpRewritePattern::OpRewritePattern; private: ModuleOp module_op_; - mlir::LogicalResult matchAndRewrite( - mlir::TF::XlaCallModuleOp op, PatternRewriter &rewriter) const override { - mlir::OwningOpRef stablehlo_module_op = - mlir::stablehlo::deserializePortableArtifact(op.getModuleAttr(), - getContext()); + LogicalResult matchAndRewrite(TF::XlaCallModuleOp op, + PatternRewriter &rewriter) const override { + OwningOpRef stablehlo_module_op = + stablehlo::deserializePortableArtifact(op.getModuleAttr(), + getContext()); if (stablehlo_module_op.get() == nullptr) { - return mlir::failure(); + return failure(); } SymbolTable parent_module_symbol_table(module_op_); SymbolTable stablehlo_module_symbol_table(stablehlo_module_op.get()); { - auto main_func_op = - stablehlo_module_symbol_table.lookup( - kStablehloModuleDefaultEntryFuncName); + auto main_func_op = stablehlo_module_symbol_table.lookup( + kStablehloModuleDefaultEntryFuncName); // TODO(b/291988976): move enforcement of this variable outside of this // rewrite pattern 
such that it's only checked once. Currently, this // approach results in duplicate error messages as this pattern executes @@ -126,25 +155,23 @@ class ConvertTFXlaCallModuleOp return rewriter.notifyMatchFailure(op, error_msg); } } - mlir::Builder stablehlo_builder(stablehlo_module_op.get().getContext()); + Builder stablehlo_builder(stablehlo_module_op.get().getContext()); // Rename XlaCallModuleOp's functions to avoid naming conflicts. - for (auto func_op : - stablehlo_module_op.get().getOps()) { + for (auto func_op : stablehlo_module_op.get().getOps()) { const std::string new_func_name = CreateNewFuncName(func_op.getSymName(), parent_module_symbol_table); if (failed(stablehlo_module_symbol_table.replaceAllSymbolUses( func_op, stablehlo_builder.getStringAttr(new_func_name), stablehlo_module_op.get()))) { - return mlir::failure(); + return failure(); } - mlir::SymbolTable::setSymbolName(func_op, new_func_name); + SymbolTable::setSymbolName(func_op, new_func_name); } // Move all functions from XlaCallModuleOp's stablehlo module, to parent // module. Also marks the stablehlo module entry function as private. - mlir::func::FuncOp main_fn; - for (auto func_op : - stablehlo_module_op.get().getOps()) { - mlir::func::FuncOp cloned_func_op = func_op.clone(); + func::FuncOp main_fn; + for (auto func_op : stablehlo_module_op.get().getOps()) { + func::FuncOp cloned_func_op = func_op.clone(); if (IsShloMainFuncOp(cloned_func_op)) { main_fn = cloned_func_op; } @@ -153,11 +180,20 @@ class ConvertTFXlaCallModuleOp parent_module_symbol_table.insert(cloned_func_op); } + // When the `XlaCallModuleOp`'s callee accepts a platform index argument, + // remove it. This is because when converted to `CallOp` there will be a + // mismatch btw. the number of arguments passed and number of parameters + // accepted (the platform index argument is an extra argument that is not + // expressed by the operands of XlaCallModuleOp). 
+ if (ContainsPlatformIndexArg(op)) { + RemovePlatformIndexArg(getContext(), main_fn); + } + // The stablehlo module main function's input tensor types might be // different from the XlaCallModuleOp's input tensor types. For example, // The XlaCallModuleOp's input is tensor<*xf32> while the function's // argument type is tensor<1x2f32>. - llvm::SmallVector casted_operands; + SmallVector casted_operands; casted_operands.reserve(main_fn.getNumArguments()); for (const auto &operand_and_type : zip(op.getOperands(), main_fn.getFunctionType().getInputs())) { @@ -176,7 +212,7 @@ class ConvertTFXlaCallModuleOp casted_operands); rewriter.replaceOp(op, call->getResults()); - return mlir::success(); + return success(); } // Creates a new function name to avoid collision. The naming scheme is @@ -206,9 +242,9 @@ class TFXlaCallModuleOpToStablehloPass StringRef getDescription() const final { return "Legalize TF_XlaCallModule Op to stablehlo"; } - void getDependentDialects(::mlir::DialectRegistry ®istry) const override { - registry.insert(); + void getDependentDialects(DialectRegistry ®istry) const override { + registry.insert(); } void runOnOperation() override { @@ -222,7 +258,7 @@ class TFXlaCallModuleOpToStablehloPass } }; -std::unique_ptr> +std::unique_ptr> CreateLegalizeTFXlaCallModuleToStablehloPass() { return std::make_unique(); } diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h index 211019b70524f5..a91584b0ff050e 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h @@ -47,7 +47,7 @@ CreateComposeUniformQuantizedTypePass(); // quantized typed tensors and converts them to equivalent ops in the TFLite // dialect. std::unique_ptr> -CreateUniformQuantizedStablehloToTflPass(); +CreateUniformQuantizedStableHloToTflPass(); // Create a pass that legalizes MHLO to TF dialect. 
std::unique_ptr> CreateLegalizeHloToTfPass(); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td index ca49787a715bd5..be4c936602eafb 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.td @@ -58,10 +58,10 @@ def ComposeUniformQuantizedTypePass : Pass<"compose-uniform-quantized-type", "Mo ]; } -def UniformQuantizedStablehloToTflPass +def UniformQuantizedStableHloToTflPass : Pass<"uniform-quantized-stablehlo-to-tfl", "mlir::func::FuncOp"> { let summary = "Converts StableHLO ops using uniform quantized types to equivalent TFL ops."; - let constructor = "mlir::odml::CreateUniformQuantizedStablehloToTflPass()"; + let constructor = "mlir::odml::CreateUniformQuantizedStableHloToTflPass()"; let description = [{ Converts StableHLO ops that accept or return uniform quantized types to equivalent ops in the TFLite dialect. diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc index 3bb9eddbfa5021..b120ca89c290d4 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.cc @@ -135,6 +135,16 @@ void StablehloToTflPass::runOnOperation() { continue; } + if (attr.isa<::mlir::DenseBoolArrayAttr>()) { + auto array_attr = attr.dyn_cast(); + auto start = fbb->StartVector(key); + for (auto bool_value : array_attr.asArrayRef()) { + fbb->Add(bool_value); + } + fbb->EndVector(start, /*typed=*/true, /*fixed=*/false); + continue; + } + if (attr.isa<::mlir::StringAttr>()) { fbb->String(key, attr.dyn_cast().data()); continue; diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc index 
5e4f79f18ce503..8e22b343d7f3d0 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/tfl_stablehlo_pass.cc @@ -44,6 +44,51 @@ limitations under the License. namespace mlir { namespace odml { +static bool isDenseI64Array(llvm::StringRef op_name, + llvm::StringRef field_name) { + if (op_name == "stablehlo.broadcast" && field_name == "broadcast_sizes") + return true; + if (op_name == "stablehlo.broadcast_in_dim" && + field_name == "broadcast_dimensions") + return true; + if ((op_name == "stablehlo.convolution" || + op_name == "stablehlo.dynamic_conv") && + (field_name == "window_strides" || field_name == "lhs_dilation" || + field_name == "rhs_dilation")) + return true; + if (op_name == "stablehlo.dynamic_broadcast_in_dim" && + (field_name == "broadcast_dimensions" || + field_name == "known_expanding_dimensions" || + field_name == "known_nonexpanding_dimensions")) + return true; + if ((op_name == "stablehlo.dynamic_slice" || op_name == "stablehlo.gather") && + field_name == "slice_sizes") + return true; + if (op_name == "stablehlo.fft" && field_name == "fft_length") return true; + if ((op_name == "stablehlo.map" || op_name == "stablehlo.reduce" || + op_name == "stablehlo.reverse") && + field_name == "dimensions") + return true; + if (op_name == "stablehlo.pad" && + (field_name == "edge_padding_low" || field_name == "edge_padding_high" || + field_name == "interior_padding")) + return true; + if (op_name == "stablehlo.reduce_window" && + (field_name == "window_dimensions" || field_name == "window_strides" || + field_name == "base_dilations" || field_name == "window_dilations")) + return true; + if (op_name == "stablehlo.select_and_scatter" && + (field_name == "window_dimensions" || field_name == "window_strides")) + return true; + if (op_name == "stablehlo.slice" && + (field_name == "start_indices" || field_name == "limit_indices" || + field_name == "strides")) + return true; + if 
(op_name == "stablehlo.transpose" && field_name == "permutation") + return true; + return false; +} + class TflToStablehloPass : public mlir::PassWrapper> { @@ -90,6 +135,16 @@ class TflToStablehloPass attrs.push_back(named_attr); break; } + case flexbuffers::FBT_VECTOR_BOOL: { + llvm::SmallVector vec; + const auto& vector = value.AsTypedVector(); + for (size_t i = 0; i < vector.size(); i++) { + vec.push_back(vector[i].AsBool()); + } + attrs.push_back( + builder->getNamedAttr(key, builder->getDenseBoolArrayAttr(vec))); + break; + } case flexbuffers::FBT_VECTOR_INT: { const auto& vector = value.AsTypedVector(); std::vector vec; @@ -104,11 +159,7 @@ class TflToStablehloPass shape.push_back(vec.size()); } Attribute value; - if (op_name == "stablehlo.broadcast" || - op_name == "stablehlo.dynamic_slice" || - op_name == "stablehlo.fft" || op_name == "stablehlo.pad" || - op_name == "stablehlo.reverse" || op_name == "stablehlo.slice" || - op_name == "stablehlo.transpose") { + if (isDenseI64Array(op_name, key)) { value = builder->getDenseI64ArrayAttr(vec); } else { RankedTensorType ty = tensorflow::GetTypeFromTFTensorShape( diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc index df3a5f62e8ff59..678c136d5a9571 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.cc @@ -38,7 +38,7 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, // if the input is a call_xla_module, then unwrap the content pm.addPass(mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); - // TODO(b/230572023): Consider improving shape inference for While op instead + // TODO: b/230572023 - Consider improving shape inference for While op instead // of dropping the attribute. This need not be correct for models not trained // on TPU. 
@@ -85,11 +85,18 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, } } -void AddMhloOptimizationPasses(OpPassManager& pm) { +void AddMhloOptimizationPasses(OpPassManager& pm, + const bool enable_stablehlo_quantizer) { // Rewrites some patterns for better performance. pm.addNestedPass(createUnfuseBatchNormPass()); pm.addNestedPass(createFuseConvolutionPass()); pm.addNestedPass(createOptimizePass()); + // Conditionally enable below pass because this causes unfused convolutions + // described in b/293149194. This problem is not replicated in + // StableHLO Quantizer. + if (enable_stablehlo_quantizer) { + pm.addNestedPass(createFoldBroadcastPass()); + } // Rewrites legacy StableHLO ops. pm.addNestedPass(mhlo::createLegalizeEinsumToDotGeneralPass()); @@ -109,8 +116,8 @@ void AddStablehloOptimizationPasses(OpPassManager& pm) { // StableHLO -> MHLO legalization. pm.addPass(mhlo::createStablehloLegalizeToHloPass()); - AddMhloOptimizationPasses(pm); - // TODO(b/293149194) Add `createFoldBroadcastPass` back to + AddMhloOptimizationPasses(pm, /*enable_stablehlo_quantizer=*/false); + // TODO: b/293149194 - Add `createFoldBroadcastPass` back to // `AddMhloOptimizationPasses` pm.addNestedPass(createFoldBroadcastPass()); diff --git a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h index 02d5d527901c9a..dc23a5b30f0b3c 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h @@ -36,7 +36,8 @@ void AddTFToStablehloPasses(OpPassManager& pm, bool skip_resize, void AddStablehloOptimizationPasses(OpPassManager& pm); // Adds all the backend-agonstic stableHLO optimization passes -void AddMhloOptimizationPasses(OpPassManager& pm); +void AddMhloOptimizationPasses(OpPassManager& pm, + bool enable_stablehlo_quantizer); } // namespace odml } // namespace mlir diff --git 
a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc index dffb51ecd3f800..429c6b820623dd 100644 --- a/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc +++ b/tensorflow/compiler/mlir/lite/stablehlo/transforms/uniform_quantized_stablehlo_to_tfl_pass.cc @@ -14,17 +14,21 @@ limitations under the License. ==============================================================================*/ #include #include -#include #include +#include #include #include "absl/algorithm/container.h" #include "absl/log/check.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // NOLINT: Required to register quantization dialect. #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project @@ -37,10 +41,10 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "stablehlo/dialect/Base.h" // from @stablehlo #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #define DEBUG_TYPE "uniform-quantized-stablehlo-to-tfl" @@ -51,24 +55,38 @@ namespace { // TODO: b/311029361: Add e2e test for verifying this legalization once // StableHLO Quantizer API migration is complete. +using ::mlir::quant::CastI64ArrayToI32; +using ::mlir::quant::CastI64ToI32; +using ::mlir::quant::CreateI32F32UniformQuantizedPerAxisType; +using ::mlir::quant::CreateI32F32UniformQuantizedType; +using ::mlir::quant::CreateI8F32UniformQuantizedPerAxisType; +using ::mlir::quant::CreateI8F32UniformQuantizedType; +using ::mlir::quant::FindUserOfType; +using ::mlir::quant::IsI32F32UniformQuantizedPerAxisType; using ::mlir::quant::IsI32F32UniformQuantizedType; using ::mlir::quant::IsI8F32UniformQuantizedPerAxisType; using ::mlir::quant::IsI8F32UniformQuantizedType; +using ::mlir::quant::IsOpFullyQuantized; +using ::mlir::quant::IsQuantizedTensorType; using ::mlir::quant::IsSupportedByTfliteQuantizeOrDequantizeOps; using ::mlir::quant::QuantizedType; using ::mlir::quant::UniformQuantizedPerAxisType; using ::mlir::quant::UniformQuantizedType; +const char* kPaddingSame = "SAME"; +const char* kPaddingValid = "VALID"; + #define GEN_PASS_DEF_UNIFORMQUANTIZEDSTABLEHLOTOTFLPASS #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h.inc" -class UniformQuantizedStablehloToTflPass - : public impl::UniformQuantizedStablehloToTflPassBase< - 
UniformQuantizedStablehloToTflPass> { +class UniformQuantizedStableHloToTflPass + : public impl::UniformQuantizedStableHloToTflPassBase< + UniformQuantizedStableHloToTflPass> { private: void runOnOperation() override; }; +// TODO: b/323645515 - Refactor reference functions. // Bias scales for matmul-like ops should be input scale * filter scale. Here it // is assumed that the input is per-tensor quantized and filter is per-channel // quantized. @@ -94,7 +112,7 @@ double GetBiasScale(const double input_scale, const double filter_scale) { // whereas `tfl.fully_connected` accepts an OI format. TFL::QConstOp CreateTflConstOpForFilter( stablehlo::ConstantOp filter_constant_op, PatternRewriter& rewriter, - bool is_per_axis) { + bool is_per_channel) { const auto filter_values = filter_constant_op.getValue() .cast() .getValues(); @@ -123,35 +141,27 @@ TFL::QConstOp CreateTflConstOpForFilter( Type new_filter_quantized_type; - if (is_per_axis) { + if (is_per_channel) { auto filter_quantized_type = filter_constant_op.getResult() .getType() .cast() .getElementType() .cast(); - - new_filter_quantized_type = UniformQuantizedPerAxisType::getChecked( - filter_constant_op.getLoc(), /*flags=*/true, - /*storageType=*/filter_quantized_type.getStorageType(), - /*expressedType=*/filter_quantized_type.getExpressedType(), - /*scales=*/filter_quantized_type.getScales(), - /*zeroPoints=*/filter_quantized_type.getZeroPoints(), - /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType( + filter_constant_op->getLoc(), *rewriter.getContext(), + filter_quantized_type.getScales(), + filter_quantized_type.getZeroPoints(), + /*quantization_dimension=*/0, /*narrow_range=*/true); } else { auto filter_quantized_type = filter_constant_op.getResult() .getType() .cast() .getElementType() .cast(); - new_filter_quantized_type = UniformQuantizedType::getChecked( - filter_constant_op.getLoc(), 
/*flags=*/true, - /*storageType=*/filter_quantized_type.getStorageType(), - /*expressedType=*/filter_quantized_type.getExpressedType(), - /*scale=*/filter_quantized_type.getScale(), - /*zeroPoint=*/filter_quantized_type.getZeroPoint(), - /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + new_filter_quantized_type = CreateI8F32UniformQuantizedType( + filter_constant_op->getLoc(), *rewriter.getContext(), + filter_quantized_type.getScale(), filter_quantized_type.getZeroPoint(), + /*narrow_range=*/true); } // Required because the quantized dimension is changed from 3 -> 0. @@ -172,17 +182,16 @@ TFL::QConstOp CreateTflConstOpForFilter( // transformation). The quantization scale for the bias is input scale * // filter scale. `filter_const_op` is used to retrieve the filter scales and // the size of the bias constant. -// TODO - b/309896242: Support bias fusion legalization. -TFL::QConstOp CreateTflConstOpForDummyBias(const Location loc, - const double input_scale, - TFL::QConstOp filter_const_op, - PatternRewriter& rewriter, - bool is_per_axis) { +// TODO - b/309896242: Support bias fusion legalization and spatial dimension +// check when `stride` is not 1. +TFL::QConstOp CreateTflConstOpForDummyBias( + const Location loc, const double input_scale, TFL::QConstOp filter_const_op, + PatternRewriter& rewriter, bool is_per_channel, MLIRContext& ctx) { const ArrayRef filter_shape = filter_const_op.getResult().getType().getShape(); Type bias_quantized_type; - if (is_per_axis) { + if (is_per_channel) { const auto filter_quantized_element_type = filter_const_op.getResult() .getType() @@ -191,13 +200,11 @@ TFL::QConstOp CreateTflConstOpForDummyBias(const Location loc, // The storage type is i32 for bias, which is the precision used for // accumulation. 
- bias_quantized_type = UniformQuantizedPerAxisType::getChecked( - loc, /*flags=*/true, /*storageType=*/rewriter.getI32Type(), - /*expressedType=*/rewriter.getF32Type(), /*scales=*/ + bias_quantized_type = CreateI32F32UniformQuantizedPerAxisType( + loc, ctx, GetBiasScales(input_scale, filter_quantized_element_type.getScales()), - /*zeroPoints=*/filter_quantized_element_type.getZeroPoints(), - /*quantizedDimension=*/0, /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + filter_quantized_element_type.getZeroPoints(), + /*quantization_dimension=*/0); } else { const auto filter_quantized_element_type = filter_const_op.getResult() @@ -207,13 +214,10 @@ TFL::QConstOp CreateTflConstOpForDummyBias(const Location loc, // The storage type is i32 for bias, which is the precision used for // accumulation. - bias_quantized_type = UniformQuantizedType::getChecked( - loc, /*flags=*/true, /*storageType=*/rewriter.getI32Type(), - /*expressedType=*/rewriter.getF32Type(), /*scale=*/ + bias_quantized_type = CreateI32F32UniformQuantizedType( + loc, ctx, GetBiasScale(input_scale, filter_quantized_element_type.getScale()), - /*zeroPoint=*/filter_quantized_element_type.getZeroPoint(), - /*storageTypeMin=*/llvm::minIntN(8), - /*storageTypeMax=*/llvm::maxIntN(8)); + filter_quantized_element_type.getZeroPoint()); } SmallVector bias_shape = {filter_shape[0]}; @@ -230,6 +234,7 @@ TFL::QConstOp CreateTflConstOpForDummyBias(const Location loc, } // stablehlo.uniform_quantize -> tfl.quantize +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
class RewriteUniformQuantizeOp : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -242,7 +247,8 @@ class RewriteUniformQuantizeOp const Type input_element_type = op.getOperand().getType().cast().getElementType(); if (!(input_element_type.isa() || - IsI32F32UniformQuantizedType(input_element_type))) { + IsI32F32UniformQuantizedType(input_element_type) || + IsI32F32UniformQuantizedPerAxisType(input_element_type))) { LLVM_DEBUG(llvm::dbgs() << "Uniform quantize op's input should be a " "float type or int32. Got: " << input_element_type << ".\n"); @@ -322,93 +328,472 @@ class RewriteUniformDequantizeOp } }; -// Rewrites `stablehlo.convolution` -> `tfl.conv_2d` when it accepts uniform -// quantized tensors. +// Rewrites `stablehlo.dot_general` to `tfl.fully_connected` or +// `tfl.batch_matmul` when it accepts uniform quantized tensors. // -// Conditions for the conversion: -// * Input and output tensors are per-tensor uniform quantized (i8->f32) +// StableHLO Quantizer output: +// * input: per-tensor qi8 +// * filter: per-tensor qi8 +// * output: per-tensor qi32 +// JAX Quantizer output: +// * input: per-tensor qi8 +// * filter: per-channel qi8 +// * output: per-tensor qi8 +// +// Conditions for the `tfl.batch_matmul` conversion: +// * size(batching_dimensions) <= 3 (TFLite support restriction) +// * size(contracting_dimensions) = 1 +// * Input tensors are per-tensor uniform quantized (i8->f32) +// tensors (full integer) with shape [..., r_x, c_x] or [..., c_x, r_x]. +// * The filter tensor is a per-tensor uniform quantized (i8->f32) tensor +// (constant or activation) with shape [..., r_y, c_y] or [..., c_y, r_y]. +// * Output tensors are per-tensor uniform quantized (i8->f32) or +// per-channel uniform quantized (i32->f32) tensors. +// +// Conditions for `tfl.fully_connected` conversion: +// * Input tensors are per-tensor uniform quantized (i8->f32) // tensors. 
-// * The filter tensor is constant a per-channel uniform quantized (i8->f32) -// tensor. -// * Convolution is a 2D convolution op and both the input's and filter's -// shape is 4 dimensional. -// * The filter tensor's format is `[0, 1, i, o]`. -// * Not a depthwise convolution. -// * Does not consider bias add fusion. -// TODO: b/294771704 - Support bias quantization. -class RewriteUpstreamQuantizedConvolutionOp - : public OpRewritePattern { +// * The filter tensor is constant a per-tensor uniform quantized (i8->f32) +// tensor. The quantization dimension should be 1 (the non-contracting +// dimension). +// * Output tensors are per-tensor uniform quantized (i8->f32) or +// per-channel uniform quantized (i32->f32) tensors. +// * The input tensor's rank is either 2 or 3. The last dimension of the input +// tensor should be the contracting dimension, i.e. [..., c_x, r_x]. +// * The filter tensor's rank is 2. The contracting dimension should be the +// first dimension (dim 0), i.e. [c_y, r_y] where c_y == r_x. +// TODO: b/309896242 - Add support for fused op case. Add support for +// per-channel quantization. +// TODO: b/295264927 - `stablehlo.dot_general` with per-axis quantized operands +// is not specified in the StableHLO dialect. Update the spec to allow this. 
+class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + public: - using OpRewritePattern::OpRewritePattern; + LogicalResult match(stablehlo::DotGeneralOp op) const override { + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = + op.getDotDimensionNumbers(); + const bool is_batch_matmul = + !dot_dimension_nums.getLhsBatchingDimensions().empty(); + const bool has_i32_output = IsI32F32UniformQuantizedType( + op.getResult().getType().cast().getElementType()); - static LogicalResult MatchInput(Value input) { - auto input_type = input.getType().cast(); - if (input_type.getRank() != 4) { - LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " - "Expected input rank of 4. Got: " - << input_type.getRank() << ".\n"); + if (failed(MatchInputDotGeneralCommonPattern(op.getLhs()))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match input for quantized dot_general.\n"); + return failure(); + } + if (failed(MatchFilterCommonPattern(op.getRhs()))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match filter for quantized dot_general.\n"); + return failure(); + } + if (failed(MatchOutput(op.getResult(), has_i32_output))) { + LLVM_DEBUG(llvm::dbgs() + << "Failed to match output for quantized dot_general.\n"); return failure(); } - if (const auto input_element_type = input_type.getElementType(); - !IsI8F32UniformQuantizedType(input_element_type)) { + if (is_batch_matmul) { + return MatchDotGeneralToTflBatchMatmulOp(op, dot_dimension_nums, + has_i32_output); + } + return MatchDotGeneralToTflFullyConnectedOp(op, dot_dimension_nums, + has_i32_output); + } + + void rewrite(stablehlo::DotGeneralOp op, + PatternRewriter& rewriter) const override { + const bool has_i32_output = IsI32F32UniformQuantizedType( + op.getResult().getType().cast().getElementType()); + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = + op.getDotDimensionNumbers(); + const bool is_batch_matmul = 
+ !dot_dimension_nums.getLhsBatchingDimensions().empty(); + + if (is_batch_matmul) { + RewriteDotGeneralToTflBatchMatmulOp(op, rewriter, dot_dimension_nums, + has_i32_output); + } else { + RewriteDotGeneralToTflFullyConnectedOp(op, rewriter, dot_dimension_nums, + has_i32_output); + } + } + + private: + static LogicalResult MatchDotGeneralToTflBatchMatmulOp( + stablehlo::DotGeneralOp op, + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums, + const bool has_i32_output) { + const int num_lhs_batching_dims = + dot_dimension_nums.getLhsBatchingDimensions().size(); + const int num_lhs_contracting_dims = + dot_dimension_nums.getLhsContractingDimensions().size(); + if (num_lhs_batching_dims > 3) { + LLVM_DEBUG(llvm::dbgs() << "Failed to match batching dimension for " + "quantized dot_general.\n"); + return failure(); + } + // Checking one side is enough since + // (C1) size(lhs_batching_dimensions) = size(rhs_batching_dimensions). + if (num_lhs_contracting_dims != 1) { + // Check one side is enough since + // (C2) size(lhs_contracting_dimensions) = + // size(rhs_contracting_dimensions). + LLVM_DEBUG(llvm::dbgs() << "Failed to match contract dimension for " + "quantized dot_general.\n"); + return failure(); + } + const auto input_type = op.getLhs().getType().cast(); + const int input_rank = input_type.getRank(); + const auto input_contracting_dim = + dot_dimension_nums.getLhsContractingDimensions()[0]; + if ((input_contracting_dim != input_rank - 1) && + (input_contracting_dim != input_rank - 2)) { LLVM_DEBUG(llvm::dbgs() - << "Expected an i8->f32 uniform quantized type. 
Got: " - << input_element_type << ".\n"); + << "Failed to match input contracting dimensions.\n"); return failure(); } + const auto filter_type = op.getRhs().getType().cast(); + const Type filter_element_type = filter_type.getElementType(); + if (!IsI8F32UniformQuantizedType(filter_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a per-tensor uniform " + "quantized (i8->f32) weight for dot_general. Got: " + << filter_type << "\n"); + return failure(); + } + const int rhs_rank = filter_type.cast().getRank(); + const auto rhs_contracting_dim = + dot_dimension_nums.getRhsContractingDimensions()[0]; + if ((rhs_contracting_dim != rhs_rank - 1) && + (rhs_contracting_dim != rhs_rank - 2)) { + LLVM_DEBUG(llvm::dbgs() + << "Not supported rhs contracting dim for dot_general.\n"); + return failure(); + } return success(); } - static LogicalResult MatchFilter(Value filter) { - auto filter_type = filter.getType().cast(); - if (filter_type.getRank() != 4) { - LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " - "Expected filter rank of 4. Got: " - << filter_type.getRank() << ".\n"); + static LogicalResult MatchDotGeneralToTflFullyConnectedOp( + stablehlo::DotGeneralOp op, + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums, + const bool has_i32_output) { + const int num_lhs_contracting_dims = + dot_dimension_nums.getLhsContractingDimensions().size(); + const int num_rhs_contracting_dims = + dot_dimension_nums.getRhsContractingDimensions().size(); + if (num_lhs_contracting_dims != 1 || num_rhs_contracting_dims != 1) { + LLVM_DEBUG(llvm::dbgs() + << "Expected number of contracting dimensions to be 1. Got: " + << num_rhs_contracting_dims << ".\n"); return failure(); } - const Type filter_element_type = filter_type.getElementType(); - if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { - LLVM_DEBUG( - llvm::dbgs() - << "Expected a per-channel uniform quantized (i8->f32) type. 
Got: " - << filter_element_type << "\n"); + const auto input_type = op.getLhs().getType().cast(); + if (!(input_type.getRank() == 2 || input_type.getRank() == 3)) { + LLVM_DEBUG(llvm::dbgs() << "Input expected to have rank of 2 or 3. Got: " + << input_type << ".\n"); return failure(); } - if (filter_element_type.cast() - .getQuantizedDimension() != 3) { - LLVM_DEBUG(llvm::dbgs() << "Quantized dimension should be 3. Got: " - << filter_element_type << "\n"); + const auto filter_type = op.getRhs().getType().cast(); + if (filter_type.getRank() != 2) { + LLVM_DEBUG(llvm::dbgs() + << "Filter tensor expected to have a tensor rank of 2. Got: " + << filter_type << ".\n"); return failure(); } + if (has_i32_output) { + if (!IsI8F32UniformQuantizedType(filter_type.getElementType())) { + LLVM_DEBUG(llvm::dbgs() << "Expected a per-channel uniform quantized " + "(i8->f32) type. Got: " + << filter_type.getElementType() << "\n"); + return failure(); + } + } else { + if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { + LLVM_DEBUG(llvm::dbgs() << "Expected a per-channel uniform quantized " + "(i8->f32) type. Got: " + << filter_type.getElementType() << "\n"); + return failure(); + } + } - if (Operation* filter_op = filter.getDefiningOp(); - filter_op == nullptr || !isa(filter_op)) { - LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); + // If the op has a fusible bias, make sure the bias is a constant. 
+ if (auto add_op = FindUserOfType(op); + add_op != nullptr && + !isa(add_op->getOperand(1).getDefiningOp())) { + LLVM_DEBUG(llvm::dbgs() << "Expected a `stablehlo.constant` as the " + << "rhs of `stablehlo.add`.\n"); + } + + return success(); + } + + static LogicalResult MatchInputDotGeneralCommonPattern(const Value input) { + const auto input_type = input.getType().cast(); + if (const auto input_element_type = input_type.getElementType(); + !IsI8F32UniformQuantizedType(input_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected an i8->f32 uniform quantized type. Got: " + << input_element_type << ".\n"); + return failure(); + } + + if (!input_type.hasRank()) { + LLVM_DEBUG(llvm::dbgs() << "Expected input_type to have rank.\n"); return failure(); } + return success(); + } + static LogicalResult MatchFilterCommonPattern(const Value filter) { + auto filter_type = filter.getType().cast(); + if (!filter_type.hasRank()) { + LLVM_DEBUG(llvm::dbgs() << "Expected rhs of dot_general has rank. Got: " + << filter.getType() << "\n"); + return failure(); + } return success(); } - static LogicalResult MatchOutput(Value output) { + static LogicalResult MatchOutput(const Value output, + const bool has_i32_output) { const Type output_element_type = output.getType().cast().getElementType(); + if (has_i32_output) { + if (!IsI32F32UniformQuantizedType(output_element_type)) { + LLVM_DEBUG( + llvm::dbgs() + << "Expected a per-tensor uniform quantized (i32->f32) type. Got: " + << output_element_type << ".\n"); + return failure(); + } + return success(); + } if (!IsI8F32UniformQuantizedType(output_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized (i8->f32) type. Got: " - << output_element_type << ".\n"); + LLVM_DEBUG( + llvm::dbgs() + << "Expected a per-tensor uniform quantized (i8->f32) type. 
Got: " + << output_element_type << ".\n"); return failure(); } - return success(); } + static void RewriteDotGeneralToTflBatchMatmulOp( + stablehlo::DotGeneralOp op, PatternRewriter& rewriter, + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums, + const bool has_i32_output) { + const auto rhs_contracting_dims = + dot_dimension_nums.getRhsContractingDimensions(); + const auto lhs_contracting_dims = + dot_dimension_nums.getLhsContractingDimensions(); + + const Value rhs_value = op.getRhs(); + const Value lhs_value = op.getLhs(); + + Operation* rhs_op = rhs_value.getDefiningOp(); + auto filter_constant_op = dyn_cast_or_null(rhs_op); + + // Set to `nullptr` because this attribute only matters when the input is + // dynamic-range quantized. + const BoolAttr asymmetric_quantize_inputs = nullptr; + + const int lhs_rank = lhs_value.getType().cast().getRank(); + const BoolAttr adj_x = + (lhs_contracting_dims[0] == lhs_rank - 2 ? rewriter.getBoolAttr(true) + : rewriter.getBoolAttr(false)); + const int rhs_rank = rhs_value.getType().cast().getRank(); + const BoolAttr adj_y = + (rhs_contracting_dims[0] == rhs_rank - 1 ? rewriter.getBoolAttr(true) + : rewriter.getBoolAttr(false)); + + // Create BMM assuming rhs is activation. + auto tfl_batchmatmul_op = rewriter.create( + op.getLoc(), /*output=*/op.getResult().getType(), + /*input=*/lhs_value, + /*filter=*/rhs_value, adj_x, adj_y, asymmetric_quantize_inputs); + + // Update BMM if rhs is a constant. 
+ if (filter_constant_op != nullptr) { + const auto rhs_uniform_quantized_type = + rhs_value.getType().cast(); + const auto rhs_constant_value_attr = + cast(filter_constant_op.getValue()); + auto rhs_constant_op = rewriter.create( + rhs_op->getLoc(), + /*output=*/TypeAttr::get(rhs_uniform_quantized_type), + rhs_constant_value_attr); + tfl_batchmatmul_op = rewriter.create( + op.getLoc(), /*output=*/op.getResult().getType(), + /*input=*/lhs_value, /*filter=*/rhs_constant_op.getResult(), adj_x, + adj_y, asymmetric_quantize_inputs); + } + + rewriter.replaceAllUsesWith(op.getResult(), tfl_batchmatmul_op.getResult()); + } + static void RewriteDotGeneralToTflFullyConnectedOp( + stablehlo::DotGeneralOp op, PatternRewriter& rewriter, + const stablehlo::DotDimensionNumbersAttr dot_dimension_nums, + const bool has_i32_output) { + const Value rhs_value = op.getRhs(); + const Value lhs_value = op.getLhs(); + + Operation* rhs_op = rhs_value.getDefiningOp(); + const auto filter_constant_op = + dyn_cast_or_null(rhs_op); + + // Set to `nullptr` because this attribute only matters when the input is + // dynamic-range quantized. + const BoolAttr asymmetric_quantize_inputs = nullptr; + + // Checks for `tfl.fully_connected` condition. + + // StableHLO Quantizer does not yet support per-channel quantization of + // dot_general. + const bool is_per_channel = !has_i32_output; + // Create the new filter constant - transpose filter value + // from [i, o] -> [o, i]. This is because we assume `[i, o]` format for + // `stablehlo.dot_general` (i.e. contracting dimension == 1) whereas + // `tfl.fully_connected` accepts an OI format. 
+ TFL::QConstOp new_filter_constant_op = + CreateTflConstOpForFilter(filter_constant_op, rewriter, is_per_channel); + + const double input_scale = lhs_value.getType() + .cast() + .getElementType() + .cast() + .getScale(); + TFL::QConstOp bias_tfl_op; + bool fuse_bias_constant = + FindUserOfType(op) && has_i32_output; + // Get the desired output type and extract any existing fusible bias + // as `TFL::QConstOp` so that it can be fused with TFL::FullyConnectedOp`. + TensorType output_type = GetOutputTypeAndOptionallyUpdateBias( + op, rewriter, &bias_tfl_op, has_i32_output, fuse_bias_constant); + + // If there is no explicit bias, create a dummy value filled with zeroes. + if (!fuse_bias_constant) { + bias_tfl_op = CreateTflConstOpForDummyBias( + op.getLoc(), input_scale, new_filter_constant_op, rewriter, + is_per_channel, *op.getContext()); + } + rewriter.replaceOpWithNewOp( + op, /*output=*/output_type, + /*input=*/lhs_value, + /*filter=*/new_filter_constant_op.getResult(), + /*bias=*/bias_tfl_op.getResult(), + /*fused_activation_function=*/rewriter.getStringAttr("NONE"), + /*weights_format=*/rewriter.getStringAttr("DEFAULT"), + /*keep_num_dims=*/rewriter.getBoolAttr(false), + asymmetric_quantize_inputs); + } + + static TensorType GetOutputTypeAndOptionallyUpdateBias( + Operation* op, PatternRewriter& rewriter, TFL::QConstOp* bias_tfl_op, + const bool has_i32_output, const bool fuse_bias_constant) { + TensorType output_type; + if (has_i32_output) { + Operation* uniform_quantize_op; + if (fuse_bias_constant) { + Operation* add_op = FindUserOfType(op); + uniform_quantize_op = FindUserOfType(add_op); + auto filter_quantized_type = op->getOperand(1) + .getType() + .cast() + .getElementType() + .cast(); + double bias_scale = GetBiasScale( + /*input_scale=*/op->getOperand(0) + .getType() + .cast() + .getElementType() + .cast() + .getScale(), + /*filter_scale=*/filter_quantized_type.getScale()); + ArrayRef output_shape = + op->getResult(0).getType().cast().getShape(); + 
const SmallVector bias_shape = { + output_shape[output_shape.size() - 1]}; + auto bias_quantized_type = CreateI32F32UniformQuantizedType( + op->getLoc(), *op->getContext(), std::move(bias_scale), + op->getResult(0) + .getType() + .cast() + .getElementType() + .cast() + .getZeroPoint()); + Operation* stablehlo_bias_op = add_op->getOperand(1).getDefiningOp(); + auto bias_type = RankedTensorType::getChecked(op->getLoc(), bias_shape, + bias_quantized_type); + auto bias_value = cast( + cast(stablehlo_bias_op).getValue()); + + *bias_tfl_op = rewriter.create( + op->getLoc(), + /*output=*/TypeAttr::get(bias_type), /*value=*/bias_value); + } else { + uniform_quantize_op = FindUserOfType(op); + } + + auto result_quantized_type = uniform_quantize_op->getResult(0) + .getType() + .cast() + .getElementType() + .cast(); + auto new_result_quantized_type = CreateI8F32UniformQuantizedType( + uniform_quantize_op->getLoc(), *rewriter.getContext(), + result_quantized_type.getScale(), + result_quantized_type.getZeroPoint()); + output_type = op->getResult(0).getType().cast().clone( + new_result_quantized_type); + // Omit any bias and requantize ops as `tfl.fully_connected` outputs a + // fused `qi8` type. + FindUserOfType<>(uniform_quantize_op)->setOperand(0, op->getResult(0)); + } else { + output_type = op->getResult(0).getType().cast(); + } + return output_type; + } +}; + +// Rewrites `stablehlo.convolution` into fused `tfl.conv_2d`. +// If available, fuse bias and activation adjacent to `stablehlo.convolution`. 
+// This RewritePattern rewrites both the following into `tfl.conv_2d` op: +// +// StableHLO Quantizer output: +// * input: per-tensor qi8 +// * filter: per-channel qi8 (`quantization_dimension` = 3) +// * output: per-channel qi32 (`quantization_dimension` = 3) +// JAX Quantizer output: +// * input: per-tensor qi8 +// * filter: per-channel qi8 (`quantization_dimension` = 3) +// * output: per-tensor qi8 +// +// Conditions for the conversion: +// * Input tensors are per-tensor uniform quantized (i8->f32) +// tensors. +// * The filter tensor is constant a per-channel uniform quantized (i8->f32) +// tensor. +// * Output tensors are per-tensor uniform quantized (i8->f32) or +// per-channel uniform quantized (i32->f32) tensors. +// * Convolution is a 2D convolution op and both the input's and filter's +// shape is 4 dimensional. +// * The filter tensor's format is `[0, 1, i, o]`. +// * Not a depthwise convolution. +class RewriteQuantizedConvolutionOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; LogicalResult match(stablehlo::ConvolutionOp op) const override { + const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType( + op.getResult().getType().cast().getElementType()); + const bool fuse_bias_constant = + FindUserOfType(op) && has_i32_output; stablehlo::ConvDimensionNumbersAttr dimension_numbers = op.getDimensionNumbers(); @@ -446,14 +831,39 @@ class RewriteUpstreamQuantizedConvolutionOp return failure(); } + // TODO: b/309896242 - Lift the assumptions on adjacent ops below + // as we cover more dynamic fused pattern legalization. 
+ if (fuse_bias_constant) { + Operation* add_op = FindUserOfType(op); + if (add_op == nullptr) { + LLVM_DEBUG(llvm::dbgs() << "Failed to find AddOp for bias fusion.\n"); + return failure(); + } + Operation* broadcast_in_dim_op = add_op->getOperand(1).getDefiningOp(); + if (!isa(broadcast_in_dim_op)) { + LLVM_DEBUG(llvm::dbgs() << "Failed to find broadcasted bias.\n"); + return failure(); + } + Operation* bias_const_op = + broadcast_in_dim_op->getOperand(0).getDefiningOp(); + if (!isa(bias_const_op)) { + LLVM_DEBUG(llvm::dbgs() << "Failed to find bias constant.\n"); + return failure(); + } + } + return success(); } void rewrite(stablehlo::ConvolutionOp op, PatternRewriter& rewriter) const override { + const bool has_i32_output = IsI32F32UniformQuantizedPerAxisType( + op.getResult().getType().cast().getElementType()); + stablehlo::ConvDimensionNumbersAttr dimension_numbers = + op.getDimensionNumbers(); + Value filter_value = op.getOperand(1); Operation* filter_op = filter_value.getDefiningOp(); - auto filter_uniform_quantized_type = filter_value.getType() .cast() @@ -467,15 +877,11 @@ class RewriteUpstreamQuantizedConvolutionOp // (https://github.com/tensorflow/tensorflow/blob/5430e5e238f868ce977df96ba89c9c1d31fbe8fa/tensorflow/compiler/mlir/lite/ir/tfl_ops.td#L933). // The quantized dimension should correspond to the output feature // dimension. 
- auto new_filter_quantized_type = UniformQuantizedPerAxisType::getChecked( - filter_op->getLoc(), /*flags=*/true, - /*storageType=*/filter_uniform_quantized_type.getStorageType(), - filter_uniform_quantized_type.getExpressedType(), + auto new_filter_quantized_type = CreateI8F32UniformQuantizedPerAxisType( + filter_op->getLoc(), *op.getContext(), filter_uniform_quantized_type.getScales(), filter_uniform_quantized_type.getZeroPoints(), - /*quantizedDimension=*/0, - filter_uniform_quantized_type.getStorageTypeMin(), - filter_uniform_quantized_type.getStorageTypeMax()); + /*quantization_dimension=*/0, /*narrow_range=*/true); auto filter_constant_value_attr = cast( cast(filter_value.getDefiningOp()).getValue()); @@ -495,48 +901,33 @@ class RewriteUpstreamQuantizedConvolutionOp filter_op->getLoc(), /*output=*/TypeAttr::get(new_filter_result_type), new_filter_value_attr); - SmallVector bias_scales = - GetBiasScales(/*input_scale=*/op.getOperand(0) - .getType() - .cast() - .getElementType() - .cast() - .getScale(), - /*filter_scales=*/new_filter_quantized_type.getScales()); + Operation* uniform_quantize_op; + const bool fuse_bias_constant = + FindUserOfType(op) && has_i32_output; + if (has_i32_output) { + if (fuse_bias_constant) { + Operation* add_op = FindUserOfType(op); + uniform_quantize_op = FindUserOfType(add_op); + } else { + uniform_quantize_op = FindUserOfType(op); + } + } - // Create a bias filled with zeros. Mimics the behavior of no bias add. const int64_t num_output_features = new_filter_result_type.getShape()[0]; const SmallVector bias_shape = {num_output_features}; - auto bias_quantized_type = UniformQuantizedPerAxisType::getChecked( - op.getLoc(), /*flags=*/true, - /*storageType=*/rewriter.getI32Type(), // i32 for bias - /*expressedType=*/rewriter.getF32Type(), - /*scales=*/std::move(bias_scales), - /*zeroPoints=*/new_filter_quantized_type.getZeroPoints(), // Zeros. 
- /*quantizedDimension=*/0, - /*storageTypeMin=*/std::numeric_limits::min(), - /*storageTypeMax=*/std::numeric_limits::max()); - auto bias_type = RankedTensorType::getChecked(op.getLoc(), bias_shape, - bias_quantized_type); - - // Create a bias constant. It should have values of 0. - auto bias_value_type = RankedTensorType::getChecked(op.getLoc(), bias_shape, - rewriter.getI32Type()); - auto bias_value = DenseIntElementsAttr::get( - bias_value_type, APInt(/*numBits=*/32, /*value=*/0, /*isSigned=*/true)); - auto bias = rewriter.create( - op.getLoc(), /*output=*/TypeAttr::get(bias_type), - /*value=*/bias_value); + + TFL::QConstOp bias = GetBiasOp(op, rewriter, new_filter_result_type, + new_filter_quantized_type, bias_shape, + has_i32_output, fuse_bias_constant); // Determine the attributes for the TFL::Conv2DOp. - // TODO: b/294808863 - Use `padding = "SAME"` if the padding attribute - // matches the semantics. + Value input_value = op.getOperand(0); if (const DenseIntElementsAttr padding_attr = op.getPaddingAttr(); - !IsPaddingValid(padding_attr)) { + !HasProperPadding(op, dimension_numbers, padding_attr)) { // Add an extra tfl.pad_op if there are explicit padding values. This - // extra pad op will allow us to always set the `padding` attribute of the - // newly created tfl.conv_2d op as "VALID". + // extra pad op will allow us to always set the `padding` attribute of + // the newly created tfl.conv_2d op as "VALID". TFL::PadOp pad_op = CreateTflPadOp(op.getLoc(), padding_attr, input_value, rewriter); input_value = pad_op.getResult(); @@ -545,40 +936,130 @@ class RewriteUpstreamQuantizedConvolutionOp const auto [stride_h, stride_w] = GetStrides(op); const auto [dilation_h_factor, dilation_w_factor] = GetDilationFactors(op); - auto tfl_conv2d_op = rewriter.create( - op.getLoc(), /*output=*/op.getResult().getType(), /*input=*/input_value, + Type output_type; + if (has_i32_output) { + // StableHLO Quantizer outputs an i32 type. 
Rewrite to i8 type result + // to meet TFLite op requirement. + auto result_quantized_type = uniform_quantize_op->getResult(0) + .getType() + .cast() + .getElementType() + .cast(); + auto new_result_quantized_type = CreateI8F32UniformQuantizedType( + uniform_quantize_op->getLoc(), *rewriter.getContext(), + result_quantized_type.getScale(), + result_quantized_type.getZeroPoint()); + output_type = op.getResult().getType().cast().clone( + new_result_quantized_type); + // Omit any bias and requantize ops as `tfl.fully_connected` outputs a + // fused `qi8` type. + FindUserOfType<>(uniform_quantize_op)->setOperand(0, op->getResult(0)); + } else { + output_type = op.getResult().getType(); + } + rewriter.replaceOpWithNewOp( + // op result should be recasted to desired quantized type. + op, output_type, + /*input=*/input_value, /*filter=*/new_filter_constant_op, /*bias=*/bias.getResult(), /*dilation_h_factor=*/rewriter.getI32IntegerAttr(dilation_h_factor), /*dilation_w_factor=*/rewriter.getI32IntegerAttr(dilation_w_factor), /*fused_activation_function=*/rewriter.getStringAttr("NONE"), - /*padding=*/rewriter.getStringAttr("VALID"), + /*padding=*/ + rewriter.getStringAttr(UseSamePadding(op, dimension_numbers) + ? kPaddingSame + : kPaddingValid), /*stride_h=*/rewriter.getI32IntegerAttr(stride_h), /*stride_w=*/rewriter.getI32IntegerAttr(stride_w)); - - rewriter.replaceAllUsesWith(op.getResult(), tfl_conv2d_op.getResult()); - rewriter.eraseOp(op); } private: - // Create a `tfl.pad` op to apply explicit padding to the input tensor that - // correspond to the `padding` attribute from the `stablehlo.convolution` op. - TFL::PadOp CreateTflPadOp(Location loc, - const DenseIntElementsAttr& padding_attr, - Value input_value, - PatternRewriter& rewriter) const { - auto padding_values = padding_attr.getValues(); - // [[h_l, h_r], [w_l, w_r]]. 
- DCHECK_EQ(padding_attr.size(), 4); + static LogicalResult MatchInput(Value input) { + auto input_type = input.getType().cast(); + if (input_type.getRank() != 4) { + LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " + "Expected input rank of 4. Got: " + << input_type.getRank() << ".\n"); + return failure(); + } - // In StableHLO the padding attribute doesn't include the padding values for - // input and output feature dimensions (because they are 0 anyways). In - // TFLite, padding values for input and output feature dimensions should be - // explicitly set to 0s. Note that TFLite's input tensor is formatted as - // OHWI. The resulting pad values becomes: [[0, 0], [h_l, h_r], [w_l, w_r], + if (const auto input_element_type = input_type.getElementType(); + !IsI8F32UniformQuantizedType(input_element_type)) { + LLVM_DEBUG(llvm::dbgs() + << "Expected an i8->f32 uniform quantized type. Got: " + << input_element_type << ".\n"); + return failure(); + } + + return success(); + } + + static LogicalResult MatchFilter(Value filter) { + auto filter_type = filter.getType().cast(); + if (filter_type.getRank() != 4) { + LLVM_DEBUG(llvm::dbgs() << "Only 2D convolution op is supported. " + "Expected filter rank of 4. Got: " + << filter_type.getRank() << ".\n"); + return failure(); + } + + const Type filter_element_type = filter_type.getElementType(); + if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { + LLVM_DEBUG( + llvm::dbgs() + << "Expected a per-channel uniform quantized (i8->f32) type. Got: " + << filter_element_type << "\n"); + return failure(); + } + + if (filter_element_type.cast() + .getQuantizedDimension() != 3) { + LLVM_DEBUG(llvm::dbgs() << "Quantized dimension should be 3. 
Got: " + << filter_element_type << "\n"); + return failure(); + } + + if (Operation* filter_op = filter.getDefiningOp(); + filter_op == nullptr || !isa(filter_op)) { + LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); + return failure(); + } + return success(); + } + + static LogicalResult MatchOutput(Value output) { + const Type output_element_type = + output.getType().cast().getElementType(); + if (!IsI32F32UniformQuantizedPerAxisType(output_element_type) && + !IsI8F32UniformQuantizedType(output_element_type)) { + LLVM_DEBUG( + llvm::dbgs() + << "Expected a per-channel uniform quantized (i32->f32) type or " + << "per-tensor uniform quantized (i8->f32) type. Got: " + << output_element_type << ".\n"); + return failure(); + } + return success(); + } + // Create a `tfl.pad` op to apply explicit padding to the input tensor that + // correspond to the `padding` attribute from the `stablehlo.convolution` op. + TFL::PadOp CreateTflPadOp(Location loc, + const DenseIntElementsAttr& padding_attr, + Value input_value, + PatternRewriter& rewriter) const { + auto padding_values = padding_attr.getValues(); + // [[h_l, h_r], [w_l, w_r]]. + DCHECK_EQ(padding_attr.size(), 4); + + // In StableHLO the padding attribute doesn't include the padding values for + // input and output feature dimensions (because they are 0 anyways). In + // TFLite, padding values for input and output feature dimensions should be + // explicitly set to 0s. Note that TFLite's input tensor is formatted as + // OHWI. The resulting pad values becomes: [[0, 0], [h_l, h_r], [w_l, w_r], // [0, 0]] SmallVector tfl_pad_values = {0, 0}; // For output feature dim. for (const int64_t padding_value : padding_values) { - tfl_pad_values.push_back(static_cast(padding_value)); + tfl_pad_values.push_back(CastI64ToI32(padding_value).value()); } // For input feature dim. 
tfl_pad_values.push_back(0); @@ -634,7 +1115,7 @@ class RewriteUpstreamQuantizedConvolutionOp ArrayRef filter_shape = filter_value_attr.getShapedType().getShape(); SmallVector filter_constant_values; - for (const auto filter_val : filter_value_attr.getValues()) { + for (auto filter_val : filter_value_attr.getValues()) { filter_constant_values.push_back(filter_val); } @@ -659,8 +1140,8 @@ class RewriteUpstreamQuantizedConvolutionOp for (int k = 0; k < filter_shape[2]; ++k) { for (int l = 0; l < filter_shape[3]; ++l) { // [i][j][k][l] -> [l][i][j][k] - const int old_idx = get_array_idx(filter_shape, i, j, k, l); - const int new_idx = get_array_idx(new_filter_shape, l, i, j, k); + int old_idx = get_array_idx(filter_shape, i, j, k, l); + int new_idx = get_array_idx(new_filter_shape, l, i, j, k); new_filter_constant_values[new_idx] = filter_constant_values[old_idx]; @@ -679,23 +1160,44 @@ class RewriteUpstreamQuantizedConvolutionOp return new_filter_constant_value_attr; } - // Determines if the padding attribute corresponds to "VALID" + bool UseSamePadding( + Operation* op, + stablehlo::ConvDimensionNumbersAttr dimension_numbers) const { + // TODO: b/294808863 - Account for dynamic shapes. + const ArrayRef input_shape = + op->getOperand(0).getType().cast().getShape(); + const ArrayRef output_shape = + op->getResult(0).getType().cast().getShape(); + const ArrayRef input_spatial_dim_inds = + dimension_numbers.getInputSpatialDimensions(); + const ArrayRef output_spatial_dim_inds = + dimension_numbers.getOutputSpatialDimensions(); + return (input_shape[input_spatial_dim_inds[0]] == + output_shape[output_spatial_dim_inds[0]] && + input_shape[input_spatial_dim_inds[1]] == + output_shape[output_spatial_dim_inds[1]]); + } + + // Determines if the padding attribute corresponds to "VALID" or "SAME". + // If not, the input's shape should be adjusted with explicit `tfl.pad` op. // (https://www.tensorflow.org/api_docs/python/tf/nn). 
- bool IsPaddingValid(const DenseIntElementsAttr& padding_attr) const { + bool HasProperPadding(Operation* op, + stablehlo::ConvDimensionNumbersAttr dimension_numbers, + const DenseIntElementsAttr& padding_attr) const { // If padding_attr is empty, it defaults to splat 0s. - return !padding_attr || (padding_attr.isSplat() && - padding_attr.getSplatValue() == 0); + return UseSamePadding(op, dimension_numbers) || + (!padding_attr || (padding_attr.isSplat() && + padding_attr.getSplatValue() == 0)); } // Returns the stride amount for the height and width, respectively. std::pair GetStrides(stablehlo::ConvolutionOp op) const { - const Attribute window_strides_attr = op.getWindowStridesAttr(); + DenseI64ArrayAttr window_strides_attr = op.getWindowStridesAttr(); if (!window_strides_attr) { return {1, 1}; // Default values. } - const auto window_strides_attr_value = - hlo::getI64Array(window_strides_attr); + auto window_strides_attr_value = window_strides_attr.asArrayRef(); // It is guaranteed from the spec that it has two values: // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#convolution. return {window_strides_attr_value[0], window_strides_attr_value[1]}; @@ -704,580 +1206,510 @@ class RewriteUpstreamQuantizedConvolutionOp // Returns the dilation amount for the height and width, respectively. std::pair GetDilationFactors( stablehlo::ConvolutionOp op) const { - const Attribute lhs_dilation_attr = op.getLhsDilationAttr(); + DenseI64ArrayAttr lhs_dilation_attr = op.getLhsDilationAttr(); if (!lhs_dilation_attr) { return {1, 1}; // Default values. } - const auto lhs_dilation_attr_value = hlo::getI64Array(lhs_dilation_attr); + auto lhs_dilation_attr_value = lhs_dilation_attr.asArrayRef(); // It is guaranteed from the spec that it has two values: // https://github.com/openxla/stablehlo/blob/main/docs/spec.md#convolution. 
return {lhs_dilation_attr_value[0], lhs_dilation_attr_value[1]}; } + + TFL::QConstOp GetBiasOp( + stablehlo::ConvolutionOp op, PatternRewriter& rewriter, + const RankedTensorType new_filter_result_type, + const UniformQuantizedPerAxisType new_filter_quantized_type, + const SmallVector bias_shape, const bool has_i32_output, + const bool fuse_bias_constant) const { + const SmallVector bias_scales = GetBiasScales( + /*input_scale=*/op.getOperand(0) + .getType() + .cast() + .getElementType() + .cast() + .getScale(), + /*filter_scales=*/new_filter_quantized_type.getScales()); + + const auto bias_quantized_type = CreateI32F32UniformQuantizedPerAxisType( + op.getLoc(), *op.getContext(), std::move(bias_scales), + new_filter_quantized_type.getZeroPoints(), + /*quantization_dimension=*/0); + const auto bias_type = RankedTensorType::getChecked(op.getLoc(), bias_shape, + bias_quantized_type); + TFL::QConstOp bias; + if (fuse_bias_constant && has_i32_output) { + Operation* add_op = FindUserOfType(op); + // TODO: b/309896242 - Lift the assumptions on adjacent ops below + // as we cover more dynamic fused pattern legalization. + Operation* broadcast_in_dim_op = add_op->getOperand(1).getDefiningOp(); + Operation* bias_const_op = + broadcast_in_dim_op->getOperand(0).getDefiningOp(); + const ElementsAttr bias_constant_value = + cast(bias_const_op).getValue(); + bias = rewriter.create(op.getLoc(), + /*output=*/TypeAttr::get(bias_type), + /*value=*/bias_constant_value); + } else { + // Create a bias constant. It should have values of 0. + const auto bias_value_type = RankedTensorType::getChecked( + op.getLoc(), bias_shape, rewriter.getI32Type()); + // Create a bias filled with zeros. Mimics the behavior of no bias add. 
+ const auto bias_value = DenseIntElementsAttr::get( + bias_value_type, + APInt(/*numBits=*/32, /*value=*/0, /*isSigned=*/true)); + bias = rewriter.create(op.getLoc(), + /*output=*/TypeAttr::get(bias_type), + /*value=*/bias_value); + } + return bias; + } }; -// Rewrites full-integer quantized `stablehlo.dot_general` ->`tfl.batch_matmul` -// when it accepts uniform quantized tensors. -// -// Since transpose and reshape of quantized tensors are not natively supported -// at the moment, the conversion condition is relatively strict, following -// (https://www.tensorflow.org/api_docs/cc/class/tensorflow/ops/batch-mat-mul-v3) -// -// Conditions for the conversion : -// * size(batching_dimensions) <= 3 (TFLite support restriction) -// * size(contracting_dimensions) = 1 -// * Input (lhs) and output tensors are per-tensor uniform quantized (i8->f32) -// tensors (full integer) with shape [..., r_x, c_x] or [..., c_x, r_x]. -// * The rhs tensor is a per-tensor uniform quantized (i8->f32) tensor -// (constant or activation) with shape [..., r_y, c_y] or [..., c_y, r_y]. -// -// TODO: b/293650675 - Relax the conversion condition to support dot_general in -// general. -class RewriteUpstreamQuantizedDotGeneralOpToBatchMatmulOp - : public OpRewritePattern { +// Rewrites quantized stablehlo.transpose to tfl.transpose. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. +class RewriteQuantizedTransposeOp + : public OpRewritePattern { public: - using OpRewritePattern::OpRewritePattern; + using OpRewritePattern::OpRewritePattern; - static LogicalResult MatchLhs( - Value lhs, stablehlo::DotDimensionNumbersAttr dimension_numbers) { - auto lhs_type = lhs.getType().cast(); - if (!IsI8F32UniformQuantizedType(lhs_type.getElementType())) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a per-tensor uniform " - "quantized (i8->f32) input for dot_general. 
Got: " - << lhs_type << "\n"); - return failure(); - } - if (!lhs_type.hasRank()) { - LLVM_DEBUG(llvm::dbgs() << "Expected lhs of dot_general has rank. Got: " - << lhs_type << "\n"); - return failure(); - } - const int lhs_rank = lhs_type.getRank(); - auto lhs_contracting_dim = - dimension_numbers.getLhsContractingDimensions()[0]; - if ((lhs_contracting_dim != lhs_rank - 1) && - (lhs_contracting_dim != lhs_rank - 2)) { - LLVM_DEBUG(llvm::dbgs() - << "Not supported lhs contracting dim for dot_general.\n"); - return failure(); - } - return success(); + LogicalResult match(stablehlo::TransposeOp op) const override { + return success(IsOpFullyQuantized(op)); } - static LogicalResult MatchRhs( - Value rhs, stablehlo::DotDimensionNumbersAttr dimension_numbers) { - if (!rhs.getType().cast().hasRank()) { - LLVM_DEBUG(llvm::dbgs() << "Expected rhs of dot_general has rank. Got: " - << rhs.getType() << "\n"); - return failure(); - } - const int rhs_rank = rhs.getType().cast().getRank(); - auto rhs_contracting_dim = - dimension_numbers.getRhsContractingDimensions()[0]; - if ((rhs_contracting_dim != rhs_rank - 1) && - (rhs_contracting_dim != rhs_rank - 2)) { - LLVM_DEBUG(llvm::dbgs() - << "Not supported rhs contracting dim for dot_general.\n"); - return failure(); - } - - auto rhs_type = rhs.getType().cast(); - if (!IsI8F32UniformQuantizedType(rhs_type.getElementType())) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a per-tensor uniform " - "quantized (i8->f32) weight for dot_general. Got: " - << rhs_type << "\n"); - return failure(); - } - return success(); + void rewrite(stablehlo::TransposeOp op, + PatternRewriter& rewriter) const override { + auto operand_type = op.getOperand().getType().cast(); + const int64_t rank = operand_type.getRank(); + ArrayRef shape(rank); + TensorType permutation_type = + operand_type.cloneWith(shape, rewriter.getI32Type()); + // Cast permutation attribute from i64 to i32 as they are required to be i32 + // in TFLite. 
+ SmallVector permutation_i32 = + CastI64ArrayToI32(op.getPermutation()).value(); + auto permutation_attr = + DenseIntElementsAttr::get(permutation_type, permutation_i32); + auto permutation = + rewriter.create(op.getLoc(), permutation_attr); + rewriter.replaceOpWithNewOp(op, op.getOperand(), + permutation); } +}; - static LogicalResult MatchOutput( - Value output, stablehlo::DotDimensionNumbersAttr dimension_numbers) { - auto output_type = output.getType().cast(); - if (!IsI8F32UniformQuantizedType(output_type.getElementType())) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a per-tensor uniform " - "quantized (i8->f32) output for dot_general. Got: " - << output_type << "\n"); - return failure(); - } - return success(); +// Rewrites quantized stablehlo.reshape to tfl.reshape. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. +class RewriteQuantizedReshapeOp + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; + + LogicalResult match(stablehlo::ReshapeOp op) const override { + return success(IsOpFullyQuantized(op)); } - LogicalResult match(stablehlo::DotGeneralOp op) const override { - stablehlo::DotDimensionNumbersAttr dimension_numbers = - op.getDotDimensionNumbers(); + void rewrite(stablehlo::ReshapeOp op, + PatternRewriter& rewriter) const override { + auto result_type = op->getResult(0).getType().cast(); + // Cast result shapes from i64 to i32 as they are required to be i32 in + // TFLite. 
+ SmallVector shape_i32 = + CastI64ArrayToI32(result_type.getShape()).value(); + + const int64_t shape_length = shape_i32.size(); + ArrayRef shape(shape_length); + TensorType shape_type = result_type.cloneWith(shape, rewriter.getI32Type()); + auto shape_attr = DenseIntElementsAttr::get(shape_type, shape_i32); + auto new_shape = + rewriter.create(op.getLoc(), shape_attr); + rewriter.replaceOpWithNewOp(op, op.getOperand(), new_shape); + } +}; - // Check one side is enough since - // (C1) size(lhs_batching_dimensions) = size(rhs_batching_dimensions). - if (dimension_numbers.getLhsBatchingDimensions().size() > 3) { - LLVM_DEBUG( - llvm::dbgs() - << "Failed to match batch dimention for quantized dot_general.\n"); - return failure(); - } - // Check one side is enough since - // (C2) size(lhs_contracting_dimensions) = size(rhs_contracting_dimensions). - if (dimension_numbers.getLhsContractingDimensions().size() != 1) { - LLVM_DEBUG( - llvm::dbgs() - << "Failed to match contract dimention for quantized dot_general.\n"); - return failure(); - } +// Rewrites quantized stablehlo.select to tfl.select_v2. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
+class RewriteQuantizedSelectOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - if (failed(MatchLhs(op.getLhs(), dimension_numbers))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match input for quantized dot_general.\n"); + LogicalResult match(stablehlo::SelectOp op) const override { + if (!IsQuantizedTensorType(op.getOperand(1).getType())) { return failure(); } - if (failed(MatchRhs(op.getRhs(), dimension_numbers))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match weight for quantized dot_general.\n"); + if (!IsQuantizedTensorType(op.getOperand(2).getType())) { return failure(); } - - if (failed(MatchOutput(op.getResult(), dimension_numbers))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match output for quantized dot_general.\n"); + if (!IsQuantizedTensorType(op.getResult().getType())) { return failure(); } - return success(); } - void rewrite(stablehlo::DotGeneralOp op, + void rewrite(stablehlo::SelectOp op, PatternRewriter& rewriter) const override { - Value rhs_value = op.getRhs(); - Operation* rhs_op = rhs_value.getDefiningOp(); - - stablehlo::DotDimensionNumbersAttr dimension_numbers = - op.getDotDimensionNumbers(); - Value input_value = op.getLhs(); - const int lhs_rank = input_value.getType().cast().getRank(); - auto lhs_contracting_dim = - dimension_numbers.getLhsContractingDimensions()[0]; - BoolAttr adj_x = - (lhs_contracting_dim == lhs_rank - 2 ? rewriter.getBoolAttr(true) - : rewriter.getBoolAttr(false)); - auto rhs_contracting_dim = - dimension_numbers.getRhsContractingDimensions()[0]; - const int rhs_rank = rhs_value.getType().cast().getRank(); - BoolAttr adj_y = - (rhs_contracting_dim == rhs_rank - 1 ? rewriter.getBoolAttr(true) - : rewriter.getBoolAttr(false)); - - // Set to `nullptr` because this attribute only matters when the input is - // dynamic-range quantized. - BoolAttr asymmetric_quantize_inputs = nullptr; - - // Create BMM assuming rhs is activation. 
- auto tfl_batchmatmul_op = rewriter.create( - op.getLoc(), /*output=*/op.getResult().getType(), /*input=*/input_value, - /*filter=*/rhs_value, adj_x, adj_y, asymmetric_quantize_inputs); - - // Update BMM if rhs is a constant. - auto const_rhs = dyn_cast_or_null(rhs_op); - if (const_rhs) { - auto rhs_uniform_quantized_type = rhs_value.getType().cast(); - auto rhs_constant_value_attr = - cast(const_rhs.getValue()); - auto rhs_constant_op = rewriter.create( - rhs_op->getLoc(), - /*output=*/TypeAttr::get(rhs_uniform_quantized_type), - rhs_constant_value_attr); - tfl_batchmatmul_op = rewriter.create( - op.getLoc(), /*output=*/op.getResult().getType(), - /*input=*/input_value, /*filter=*/rhs_constant_op.getResult(), adj_x, - adj_y, asymmetric_quantize_inputs); - } - - rewriter.replaceAllUsesWith(op.getResult(), tfl_batchmatmul_op.getResult()); + Value pred = op.getOperand(0); + Value on_true = op.getOperand(1); + Value on_false = op.getOperand(2); + rewriter.replaceOpWithNewOp(op, pred, on_true, on_false); } }; -// Rewrites `stablehlo.dot_general` -> `tfl.fully_connected` when it accepts -// uniform quantized tensors with per-axis quantized filter tensor (rhs). -// -// Conditions for the conversion: -// * Input and output tensors are per-tensor uniform quantized (i8->f32) -// tensors. -// * The filter tensor is constant a per-channel uniform quantized (i8->f32) -// tensor. The quantization dimension should be 1 (the non-contracting -// dimension). -// * The input tensor's rank is either 2 or 3. The last dimension of the input -// tensor should be the contracting dimension, i.e. [..., c_x, r_x]. -// * The filter tensor's rank is 2. The contracting dimension should be the -// first dimension (dim 0), i.e. [c_y, r_y] where c_y == r_x. -// * Does not consider activation fusion. -// * Does not consider bias add fusion. -// -// TODO: b/294983811 - Merge this pattern into -// `RewriteFullIntegerQuantizedDotGeneralOp`. 
-// TODO: b/295264927 - `stablehlo.dot_general` with per-axis quantized operands -// is not specified in the StableHLO dialect. Update the spec to allow this. -class RewriteUpstreamQuantizedDotGeneralOpToTflFullyConnectedOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - +// Rewrites quantized stablehlo.concatenate to tfl.concatenation. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. +class RewriteQuantizedConcatenateOp + : public OpRewritePattern { public: - LogicalResult match(stablehlo::DotGeneralOp op) const override { - const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = - op.getDotDimensionNumbers(); - if (const int num_rhs_contracting_dims = - dot_dimension_nums.getRhsContractingDimensions().size(); - num_rhs_contracting_dims != 1) { - LLVM_DEBUG(llvm::dbgs() - << "Expected number of contracting dimensions to be 1. Got: " - << num_rhs_contracting_dims << ".\n"); - return failure(); - } - - if (failed(MatchInput(op.getOperand(0)))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match input for quantized dot_general op.\n"); - return failure(); - } - - if (failed(MatchFilter(op.getOperand(1)))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match filter for quantized dot_general op.\n"); - return failure(); - } + using OpRewritePattern::OpRewritePattern; - if (failed(MatchOutput(op.getResult()))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match output for quantized dot_general op.\n"); - return failure(); - } - - return success(); + LogicalResult match(stablehlo::ConcatenateOp op) const override { + return success(IsOpFullyQuantized(op)); } - void rewrite(stablehlo::DotGeneralOp op, + void rewrite(stablehlo::ConcatenateOp op, PatternRewriter& rewriter) const override { - // Create the new filter constant - transpose filter value - // from [i, o] -> [o, i]. This is because we assume `[i, o]` format for - // `stablehlo.dot_general` (i.e. 
contracting dimension == 1) whereas - // `tfl.fully_connected` accepts an OI format. - auto filter_constant_op = - cast(op.getOperand(1).getDefiningOp()); - - TFL::QConstOp new_filter_constant_op = - CreateTflConstOpForFilter(filter_constant_op, rewriter, - /*is_per_axis=*/true); - const Value input_value = op.getOperand(0); - const double input_scale = input_value.getType() - .cast() - .getElementType() - .cast() - .getScale(); - TFL::QConstOp bias_constant_op = CreateTflConstOpForDummyBias( - op.getLoc(), input_scale, new_filter_constant_op, rewriter, - /*is_per_axis=*/true); + Type output_type = op.getResult().getType(); + uint32_t axis = CastI64ToI32(op.getDimension()).value(); + rewriter.replaceOpWithNewOp( + op, output_type, op.getOperands(), axis, + /*fused_activation_function=*/rewriter.getStringAttr("NONE")); + } +}; - const Value result_value = op.getResult(); - // Set to `nullptr` because this attribute only matters when the input is - // dynamic-range quantized. - const BoolAttr asymmetric_quantize_inputs = nullptr; - auto tfl_fully_connected_op = rewriter.create( - op.getLoc(), /*output=*/result_value.getType(), - /*input=*/input_value, /*filter=*/new_filter_constant_op.getResult(), - /*bias=*/bias_constant_op.getResult(), - /*fused_activation_function=*/rewriter.getStringAttr("NONE"), - /*weights_format=*/rewriter.getStringAttr("DEFAULT"), - /*keep_num_dims=*/rewriter.getBoolAttr(false), - asymmetric_quantize_inputs); +// Rewrites quantized stablehlo.pad to tfl.padv2. +// tfl.dilate is introduced in between when interior padding exists. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
+class RewriteQuantizedPadOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - rewriter.replaceAllUsesWith(result_value, - tfl_fully_connected_op.getResult(0)); - rewriter.eraseOp(op); + LogicalResult match(stablehlo::PadOp op) const override { + return success(IsOpFullyQuantized(op)); } - private: - static LogicalResult MatchInput(Value input) { - auto input_type = input.getType().cast(); - if (!input_type.hasRank() || - !(input_type.getRank() == 2 || input_type.getRank() == 3)) { - LLVM_DEBUG(llvm::dbgs() << "Input expected to have rank of 2 or 3. Got: " - << input_type << ".\n"); - return failure(); + void rewrite(stablehlo::PadOp op, PatternRewriter& rewriter) const override { + Value input = op.getOperand(); + // If any of the interior padding is non-zero, operand should be dilated + // first, and then padded. + if (llvm::any_of(op.getInteriorPadding(), + [](int64_t pad) { return pad != 0; })) { + input = InsertDilateOp(op, rewriter); } - if (const auto input_element_type = input_type.getElementType(); - !IsI8F32UniformQuantizedType(input_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected an i8->f32 uniform quantized type. Got: " - << input_element_type << ".\n"); - return failure(); + TensorType operand_type = input.getType().cast(); + const int64_t rank = operand_type.getRank(); + // Shape of padding should be [rank, 2]. 
+ SmallVector shape{rank, 2}; + TensorType padding_type = + operand_type.cloneWith(shape, rewriter.getI32Type()); + + ArrayRef padding_low = op.getEdgePaddingLow(); + ArrayRef padding_high = op.getEdgePaddingHigh(); + SmallVector padding_value; + for (int i = 0; i < rank; ++i) { + padding_value.push_back(CastI64ToI32(padding_low[i]).value()); + padding_value.push_back(CastI64ToI32(padding_high[i]).value()); } - return success(); + TensorType output_type = op.getResult().getType().cast(); + Value constant_values = op.getPaddingValue(); + auto padding_attr = DenseIntElementsAttr::get(padding_type, padding_value); + auto padding = + rewriter.create(op.getLoc(), padding_attr); + rewriter.replaceOpWithNewOp(op, output_type, input, padding, + constant_values); } - static LogicalResult MatchFilter(Value filter) { - auto filter_type = filter.getType().cast(); - if (!filter_type.hasRank() || filter_type.getRank() != 2) { - LLVM_DEBUG(llvm::dbgs() - << "Filter tensor expected to have a tensor rank of 2. Got: " - << filter_type << ".\n"); - return failure(); - } - - const Type filter_element_type = filter_type.getElementType(); - if (!IsI8F32UniformQuantizedPerAxisType(filter_type.getElementType())) { - LLVM_DEBUG( - llvm::dbgs() - << "Expected a per-channel uniform quantized (i8->f32) type. 
Got: " - << filter_element_type << "\n"); - return failure(); + Value InsertDilateOp(stablehlo::PadOp op, PatternRewriter& rewriter) const { + Value input = op.getOperand(); + TensorType operand_type = input.getType().cast(); + const int64_t rank = operand_type.getRank(); + + ArrayRef dilate_shape(rank); + TensorType dilate_type = + operand_type.cloneWith(dilate_shape, rewriter.getI32Type()); + ArrayRef interior_padding_i64 = op.getInteriorPadding(); + SmallVector interior_padding_i32 = + CastI64ArrayToI32(interior_padding_i64).value(); + auto dilate_attr = + DenseIntElementsAttr::get(dilate_type, interior_padding_i32); + auto dilate = rewriter.create(op.getLoc(), dilate_attr); + + // Shape after dilation. + SmallVector dilated_shape(rank); + ArrayRef operand_shape = operand_type.getShape(); + for (int i = 0; i < rank; ++i) { + dilated_shape[i] = + operand_shape[i] + interior_padding_i64[i] * (operand_shape[i] - 1); } + TensorType output_type = op.getResult().getType().cast(); + Type dilated_output_type = output_type.clone(dilated_shape); + Value constant_values = op.getPaddingValue(); - if (filter_element_type.cast() - .getQuantizedDimension() != 1) { - LLVM_DEBUG(llvm::dbgs() << "Quantized dimension should be 1. Got: " - << filter_element_type << "\n"); - return failure(); - } + return rewriter.create(dilate.getLoc(), dilated_output_type, + input, dilate, constant_values); + } +}; - if (Operation* filter_op = filter.getDefiningOp(); - filter_op == nullptr || !isa(filter_op)) { - LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); - return failure(); - } +// Rewrites quantized stablehlo.slice to tfl.slice or tfl.strided_slice. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
+class RewriteQuantizedSliceOp : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - return success(); + LogicalResult match(stablehlo::SliceOp op) const override { + return success(IsOpFullyQuantized(op)); } - static LogicalResult MatchOutput(Value output) { - const Type output_element_type = - output.getType().cast().getElementType(); - if (!IsI8F32UniformQuantizedType(output_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized (i8->f32) type. Got: " - << output_element_type << ".\n"); - return failure(); + void rewrite(stablehlo::SliceOp op, + PatternRewriter& rewriter) const override { + auto operand_type = op.getOperand().getType().cast(); + Type output_type = op.getResult().getType(); + const int64_t rank = operand_type.getRank(); + + ArrayRef idx_shape(rank); + TensorType idx_type = + operand_type.cloneWith(idx_shape, rewriter.getI32Type()); + + ArrayRef start_idx_i64 = op.getStartIndices(); + ArrayRef limit_idx_i64 = op.getLimitIndices(); + + SmallVector start_idx_i32 = + CastI64ArrayToI32(start_idx_i64).value(); + auto start_idx_attr = DenseIntElementsAttr::get(idx_type, start_idx_i32); + auto start_idx = + rewriter.create(op.getLoc(), start_idx_attr); + + SmallVector slice_size_i32(rank); + for (int i = 0; i < rank; ++i) { + slice_size_i32[i] = + CastI64ToI32(limit_idx_i64[i] - start_idx_i64[i]).value(); + } + auto slice_size_attr = DenseIntElementsAttr::get(idx_type, slice_size_i32); + auto slice_size = + rewriter.create(op.getLoc(), slice_size_attr); + + ArrayRef strides = op.getStrides(); + // If stride of every dimension is 1, create tfl.slice and return early. + // Otherwise, create tfl.strided_slice instead. 
+ if (llvm::all_of(strides, [](int64_t stride) { return stride == 1; })) { + rewriter.replaceOpWithNewOp( + op, output_type, op.getOperand(), start_idx, slice_size); + return; } - return success(); + SmallVector stride_i32 = CastI64ArrayToI32(strides).value(); + auto stride_attr = DenseIntElementsAttr::get(idx_type, stride_i32); + auto stride = rewriter.create(op.getLoc(), stride_attr); + rewriter.replaceOpWithNewOp( + op, output_type, op.getOperand(), start_idx, slice_size, stride, + /*begin_mask=*/0, /*end_mask=*/0, + /*ellipsis_mask=*/0, /*new_axis_mask=*/0, /*shrink_axis_mask=*/0, + /*offset=*/false); } }; -// Rewrites `stablehlo.dot_general` to `tfl.fully_connected` or -// `tfl.batch_matmul` when it accepts uniform quantized tensors. -// -// Conditions for `tfl.fully_connected` conversion: -// * Input and output tensors are per-tensor uniform quantized (i8->f32) -// tensors. -// * The filter tensor is constant a per-tensor uniform quantized (i8->f32) -// tensor. The quantization dimension should be 1 (the non-contracting -// dimension). -// * The input tensor's rank is either 2 or 3. The last dimension of the input -// tensor should be the contracting dimension, i.e. [..., c_x, r_x]. -// * The filter tensor's rank is 2. The contracting dimension should be the -// first dimension (dim 0), i.e. [c_y, r_y] where c_y == r_x. -// * Does not consider activation fusion. -// * Does not consider bias add fusion. -// TODO: b/580909703 - Include conversion conditions for `tfl.batch_matmul` op. -// -// TODO: b/295264927 - `stablehlo.dot_general` with per-axis quantized operands -// is not specified in the StableHLO dialect. Update the spec to allow this. -class RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - +// Rewrites quantized stablehlo.broadcast_in_dim to tfl.broadcast_to. +// tfl.transpose is introduced when broadcast_dimensions is not in ascending +// order. 
Also, tfl.expand_dims is introduced when input rank is smaller than +// output rank. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. +class RewriteQuantizedBroadcastInDimOp + : public OpRewritePattern { public: - LogicalResult match(stablehlo::DotGeneralOp op) const override { - const stablehlo::DotDimensionNumbersAttr dot_dimension_nums = - op.getDotDimensionNumbers(); - if (const int num_rhs_contracting_dims = - dot_dimension_nums.getRhsContractingDimensions().size(); - num_rhs_contracting_dims != 1) { - LLVM_DEBUG(llvm::dbgs() - << "Expected number of contracting dimensions to be 1. Got: " - << num_rhs_contracting_dims << ".\n"); - return failure(); - } + using OpRewritePattern::OpRewritePattern; - if (failed(MatchInput(op.getOperand(0)))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match input for quantized dot_general op.\n"); - return failure(); - } + LogicalResult match(stablehlo::BroadcastInDimOp op) const override { + return success(IsOpFullyQuantized(op)); + } - if (failed(MatchFilter(op.getOperand(1)))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match filter for quantized dot_general op.\n"); - return failure(); - } + void rewrite(stablehlo::BroadcastInDimOp op, + PatternRewriter& rewriter) const override { + auto operand_type = op.getOperand().getType().cast(); + auto output_type = op.getResult().getType().cast(); + Value input = op.getOperand(); - if (failed(MatchOutput(op.getResult()))) { - LLVM_DEBUG(llvm::dbgs() - << "Failed to match output for quantized dot_general op.\n"); - return failure(); + // If broadcast_dimensions is not in ascending order, transpose first. 
+ if (!llvm::is_sorted(op.getBroadcastDimensions())) { + input = InsertTransposeOp(op, rewriter); } - if (failed(MatchUsers(op.getResult()))) { - LLVM_DEBUG(llvm::dbgs() << "Failed to match subsequent requantize for " - "quantized dot_general op.\n"); - return failure(); + // If rank of operand is smaller than that of the output, expand dimensions + // before broadcasting. + if (operand_type.getRank() < output_type.getRank()) { + input = InsertExpandDimsOp(op, rewriter, input, output_type.getRank()); } - return success(); + SmallVector broadcast_shape = + CastI64ArrayToI32(output_type.getShape()).value(); + TensorType broadcast_shape_type = + output_type.cloneWith({output_type.getRank()}, rewriter.getI32Type()); + auto broadcast_shape_attr = + DenseIntElementsAttr::get(broadcast_shape_type, broadcast_shape); + auto shape = + rewriter.create(op.getLoc(), broadcast_shape_attr); + + rewriter.replaceOpWithNewOp(op, output_type, input, + shape); } - void rewrite(stablehlo::DotGeneralOp op, - PatternRewriter& rewriter) const override { - // Create the new filter constant - transpose filter value - // from [i, o] -> [o, i]. This is because we assume `[i, o]` format for - // `stablehlo.dot_general` (i.e. contracting dimension == 1) whereas - // `tfl.fully_connected` accepts an OI format. 
- auto filter_constant_op = - cast(op.getOperand(1).getDefiningOp()); + Value InsertTransposeOp(stablehlo::BroadcastInDimOp op, + PatternRewriter& rewriter) const { + SmallVector sorted_dims = + llvm::to_vector(op.getBroadcastDimensions()); + llvm::sort(sorted_dims); + auto broadcast_dims = op.getBroadcastDimensions(); + SmallVector permutation( + llvm::map_range(broadcast_dims, [sorted_dims](int64_t dim) { + return static_cast(llvm::find(sorted_dims, dim) - + sorted_dims.begin()); + })); + auto operand_type = op.getOperand().getType().cast(); + TensorType perm_type = operand_type.cloneWith( + {static_cast(permutation.size())}, rewriter.getI32Type()); + auto perm_attr = DenseIntElementsAttr::get(perm_type, permutation); + auto perm = rewriter.create(op.getLoc(), perm_attr); + Value input = op.getOperand(); + + return rewriter.create(op.getLoc(), input, perm); + } - TFL::QConstOp new_filter_constant_op = CreateTflConstOpForFilter( - filter_constant_op, rewriter, /*is_per_axis=*/false); - const Value input_value = op.getOperand(0); - const double input_scale = input_value.getType() - .cast() - .getElementType() - .cast() - .getScale(); - TFL::QConstOp bias_constant_op = CreateTflConstOpForDummyBias( - op.getLoc(), input_scale, new_filter_constant_op, rewriter, - /*is_per_axis=*/false); + Value InsertExpandDimsOp(stablehlo::BroadcastInDimOp op, + PatternRewriter& rewriter, Value input, + int64_t output_rank) const { + auto input_type = input.getType().cast(); + SmallVector input_shape(input_type.getShape()); + SmallVector input_dims = + llvm::to_vector(op.getBroadcastDimensions()); + + while (input_dims.size() < output_rank) { + int32_t dim_to_expand = 0; + for (int32_t i = 0; i < output_rank; ++i) { + if (!llvm::is_contained(input_dims, i)) { + dim_to_expand = i; + break; + } + } - auto output_op = op.getResult().getDefiningOp(); - Operation* requantize_op = *output_op->getResult(0).getUsers().begin(); - Operation* dequantize_op = 
*requantize_op->getResult(0).getUsers().begin(); + TensorType dim_type = input_type.cloneWith({static_cast(1)}, + rewriter.getI32Type()); + ArrayRef dims(dim_to_expand); + auto dim_attr = DenseIntElementsAttr::get(dim_type, dims); + auto dim = rewriter.create(op.getLoc(), dim_attr); - // Set to `nullptr` because this attribute only matters when the input is - // dynamic-range quantized. - const BoolAttr asymmetric_quantize_inputs = nullptr; - auto tfl_fully_connected_op = rewriter.create( - op.getLoc(), - /*output=*/ - requantize_op->getResult(0).getType(), // result_value.getType(), - /*input=*/input_value, /*filter=*/new_filter_constant_op.getResult(), - /*bias=*/bias_constant_op.getResult(), - /*fused_activation_function=*/rewriter.getStringAttr("NONE"), - /*weights_format=*/rewriter.getStringAttr("DEFAULT"), - /*keep_num_dims=*/rewriter.getBoolAttr(false), - asymmetric_quantize_inputs); + input_shape.insert(input_shape.begin() + dim_to_expand, 1); + TensorType expanded_type = input_type.clone(input_shape); + input = rewriter.create(op.getLoc(), expanded_type, + input, dim); + + // Update expanded dimension in the input dimensions for the next + // iteration. + input_dims.push_back(static_cast(dim_to_expand)); + } + return input; + } +}; - auto tfl_dequantize_op = rewriter.create( - op.getLoc(), dequantize_op->getResult(0).getType(), - tfl_fully_connected_op->getResult(0)); +// Rewrites quantized stablehlo.reduce_window with max to tfl.max_pool_2d. +// TODO: b/322428814 - Add StableHLO quantizer integration tests for ODML. 
+class RewriteQuantizedReduceWindowOpWithMax + : public OpRewritePattern { + public: + using OpRewritePattern::OpRewritePattern; - rewriter.replaceAllUsesWith(dequantize_op->getResult(0), - tfl_dequantize_op->getResult(0)); + LogicalResult MatchBinaryReduceFunction(Region& function) const { + Block& body = function.front(); + if (body.getNumArguments() != 2) return failure(); - rewriter.replaceAllUsesWith(op.getResult(), - tfl_fully_connected_op.getResult(0)); + auto return_op = dyn_cast(body.back()); + if (!return_op) return failure(); + if (return_op.getNumOperands() != 1) return failure(); - rewriter.eraseOp(op); + auto reduce_op = dyn_cast_or_null( + return_op.getOperands().front().getDefiningOp()); + if (!reduce_op) return failure(); + return success(reduce_op.getLhs() == body.getArgument(0) && + reduce_op.getRhs() == body.getArgument(1)); } - private: - static LogicalResult MatchInput(Value input) { - auto input_type = input.getType().cast(); - if (!input_type.hasRank() || - !(input_type.getRank() == 2 || input_type.getRank() == 3)) { - LLVM_DEBUG(llvm::dbgs() << "Input expected to have rank of 2 or 3. Got: " - << input_type << ".\n"); + LogicalResult match(stablehlo::ReduceWindowOp op) const override { + // Check that the reduce-window is a max-reduce-window. + if (failed(MatchBinaryReduceFunction(op.getBody()))) { return failure(); } - if (const auto input_element_type = input_type.getElementType(); - !IsI8F32UniformQuantizedType(input_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected an i8->f32 uniform quantized type. Got: " - << input_element_type << ".\n"); + // Only 2d pooling is supported in TFLite. + if (op.getWindowDimensions().size() != 4) { return failure(); } - return success(); - } - - static LogicalResult MatchFilter(Value filter) { - auto filter_type = filter.getType().cast(); - if (!filter_type.hasRank() || filter_type.getRank() != 2) { - LLVM_DEBUG(llvm::dbgs() - << "Filter tensor expected to have a tensor rank of 2. 
Got: " - << filter_type << ".\n"); + // reduce_window op with dilations or padding will supported later. + // TODO: b/321099943 - Support reduce_window op with dilations and padding. + if (op.getBaseDilations().has_value() || + op.getWindowDilations().has_value() || op.getPadding().has_value()) { return failure(); } - const Type filter_element_type = filter_type.getElementType(); - if (!IsI8F32UniformQuantizedType(filter_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized (i8->f32) type. Got: " - << filter_element_type << "\n"); + // Window_dimensions and window_strides should have batch and channel + // dimension of 1 as they cannot be specified in tfl.max_pool_2d. + ArrayRef window_dims = op.getWindowDimensions(); + if (window_dims[0] != 1 || window_dims[3] != 1) { return failure(); } - - if (Operation* filter_op = filter.getDefiningOp(); - filter_op == nullptr || !isa(filter_op)) { - LLVM_DEBUG(llvm::dbgs() << "Filter should be a constant.\n"); - return failure(); + std::optional> window_strides = op.getWindowStrides(); + if (window_strides.has_value()) { + if ((*window_strides)[0] != 1 || (*window_strides)[3] != 1) { + return failure(); + } } - return success(); - } - - static LogicalResult MatchOutput(Value output) { - const Type output_element_type = - output.getType().cast().getElementType(); - if (!IsI32F32UniformQuantizedType(output_element_type)) { - LLVM_DEBUG(llvm::dbgs() - << "Expected a uniform quantized (i32->f32) type. Got: " - << output_element_type << ".\n"); - return failure(); - } - return success(); + return success(IsOpFullyQuantized(op)); } - static LogicalResult MatchUsers(Value output) { - auto output_op = output.getDefiningOp(); - - if (!output_op->hasOneUse()) { - LLVM_DEBUG(llvm::dbgs() << "Expected output to be used only once.\n"); - return failure(); - } - // TODO: b/309896242 - Add support for fused op case. 
- if (Operation* requantize_op = dyn_cast_or_null( - *output_op->getResult(0).getUsers().begin())) { - const Type requantize_element_type = requantize_op->getResult(0) - .getType() - .cast() - .getElementType(); - if (!IsI8F32UniformQuantizedType(requantize_element_type)) { - LLVM_DEBUG(llvm::dbgs() << "Expected a quantize (i8->f32) type. Got: " - << requantize_element_type << ".\n"); - return failure(); - } - if (!isa( - *requantize_op->getResult(0).getUsers().begin())) { - LLVM_DEBUG(llvm::dbgs() << "Expected a dequantize type.\n"); - return failure(); - } + void rewrite(stablehlo::ReduceWindowOp op, + PatternRewriter& rewriter) const override { + Type result_type = op.getResult(0).getType(); + Value input = op.getOperand(0); + // Ops with padding is rejected in matching function, so we can use the + // padding to be 'VALID'. + StringAttr padding = rewriter.getStringAttr("VALID"); + + // Use NHWC format. + int32_t stride_h = 1; + int32_t stride_w = 1; + std::optional> window_strides = op.getWindowStrides(); + if (window_strides.has_value()) { + stride_h = CastI64ToI32((*window_strides)[1]).value(); + stride_w = CastI64ToI32((*window_strides)[2]).value(); } - return success(); + auto stride_h_attr = IntegerAttr::get(rewriter.getI32Type(), stride_h); + auto stride_w_attr = IntegerAttr::get(rewriter.getI32Type(), stride_w); + + ArrayRef window_dims = op.getWindowDimensions(); + auto window_w_attr = IntegerAttr::get(rewriter.getI32Type(), + CastI64ToI32(window_dims[2]).value()); + auto window_h_attr = IntegerAttr::get(rewriter.getI32Type(), + CastI64ToI32(window_dims[1]).value()); + StringAttr activation_function = rewriter.getStringAttr("NONE"); + + rewriter.replaceOpWithNewOp( + op, result_type, input, padding, stride_w_attr, stride_h_attr, + window_w_attr, window_h_attr, activation_function); } }; -void UniformQuantizedStablehloToTflPass::runOnOperation() { +void UniformQuantizedStableHloToTflPass::runOnOperation() { func::FuncOp func_op = getOperation(); 
MLIRContext& ctx = getContext(); RewritePatternSet patterns(&ctx); patterns.add( - &ctx); + RewriteQuantizedDotGeneralOpToTflFullyConnectedOrBatchMatmulOp, + RewriteQuantizedConvolutionOp, RewriteQuantizedTransposeOp, + RewriteQuantizedReshapeOp, RewriteQuantizedSelectOp, + RewriteQuantizedConcatenateOp, RewriteQuantizedPadOp, + RewriteQuantizedSliceOp, RewriteQuantizedBroadcastInDimOp, + RewriteQuantizedReduceWindowOpWithMax>(&ctx); if (failed(applyPatternsAndFoldGreedily(func_op, std::move(patterns)))) { func_op.emitError() << "Failed to convert stablehlo ops with uniform " @@ -1289,11 +1721,11 @@ void UniformQuantizedStablehloToTflPass::runOnOperation() { } // namespace std::unique_ptr> -CreateUniformQuantizedStablehloToTflPass() { - return std::make_unique(); +CreateUniformQuantizedStableHloToTflPass() { + return std::make_unique(); } -static PassRegistration pass; +static PassRegistration pass; } // namespace odml } // namespace mlir diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/mix_tflite_stablehlo.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/mix_tflite_stablehlo.mlir index cbf7c3dd6cebfe..3c70fd1dfacf42 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/mix_tflite_stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/mix_tflite_stablehlo.mlir @@ -9,10 +9,8 @@ func.func @main(%arg0: tensor<1x1x1x96xf32>) -> tensor<1x1x1x96xf32> { } } -// CHECK:module attributes {tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} { -// CHECK-NEXT: func.func @main(%arg0: tensor<1x1x1x96xf32>) -> tensor<1x1x1x96xf32> attributes {tf.entry_function = {inputs = "arg0", outputs = "exp"}} { +// CHECK: func.func @main(%arg0: tensor<1x1x1x96xf32>) -> tensor<1x1x1x96xf32> attributes {tf.entry_function = {inputs = "arg0", outputs = "exp"}} { // CHECK-NEXT: %0 = stablehlo.logistic %arg0 : tensor<1x1x1x96xf32> // CHECK-NEXT: %1 = "tfl.exp"(%0) : (tensor<1x1x1x96xf32>) -> tensor<1x1x1x96xf32> // CHECK-NEXT: 
return %1 : tensor<1x1x1x96xf32> -// CHECK-NEXT: } -// CHECK-NEXT:} \ No newline at end of file +// CHECK-NEXT: } \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir index 75474e7ec8b268..76f778bcebec20 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/simple.mlir @@ -5,7 +5,9 @@ func.func @main(tensor<3x2xi32>) -> tensor<3x2xi32> { ^bb0(%arg0: tensor<3x2xi32>): - // CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.schema_version = 3 : i32} + // CHECK: module attributes + // CHECK-SAME: tfl.description = "MLIR Converted." + // CHECK-SAME: tfl.schema_version = 3 : i32 // CHECK: %{{.*}} = "tfl.pseudo_const"() {value = dense<{{\[\[1, 2\], \[3, 4\], \[5, 6\]\]}}> : tensor<3x2xi32>} // CHECK-NEXT: [[SUB:%.*]] = tfl.sub %{{.*}}, %{{.*}} {fused_activation_function = "RELU6"} : tensor<3x2xi32> diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir index bf044517c7aa3f..f7fc2f3ff12c01 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo.mlir @@ -351,7 +351,7 @@ func.func @reduce_window(%arg0: tensor<1x160x1xf32>, %arg1: tensor) -> tens ^bb0(%arg23: tensor, %arg24: tensor): %1112 = stablehlo.add %arg23, %arg24 : tensor stablehlo.return %1112 : tensor - }) {base_dilations = dense<1> : tensor<3xi64>, padding = dense<[[0, 0], [159, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = dense<1> : tensor<3xi64>, window_dimensions = dense<[1, 160, 1]> : tensor<3xi64>, window_strides = dense<1> : tensor<3xi64>} : (tensor<1x160x1xf32>, tensor) -> tensor<1x160x1xf32> + }) {base_dilations = array, padding = dense<[[0, 0], [159, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = array, 
window_dimensions = array, window_strides = array} : (tensor<1x160x1xf32>, tensor) -> tensor<1x160x1xf32> return %0 : tensor<1x160x1xf32> } @@ -360,7 +360,7 @@ func.func @reduce_window(%arg0: tensor<1x160x1xf32>, %arg1: tensor) -> tens //CHECK-NEXT: ^bb0(%arg2: tensor, %arg3: tensor): //CHECK-NEXT: %1 = stablehlo.add %arg2, %arg3 : tensor //CHECK-NEXT: stablehlo.return %1 : tensor -//CHECK-NEXT{LITERAL}: }) {base_dilations = dense<1> : tensor<3xi64>, padding = dense<[[0, 0], [159, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = dense<1> : tensor<3xi64>, window_dimensions = dense<[1, 160, 1]> : tensor<3xi64>, window_strides = dense<1> : tensor<3xi64>} : (tensor<1x160x1xf32>, tensor) -> tensor<1x160x1xf32> +//CHECK-NEXT{LITERAL}: }) {base_dilations = array, padding = dense<[[0, 0], [159, 0], [0, 0]]> : tensor<3x2xi64>, window_dilations = array, window_dimensions = array, window_strides = array} : (tensor<1x160x1xf32>, tensor) -> tensor<1x160x1xf32> //CHECK-NEXT: return %0 : tensor<1x160x1xf32> //CHECK-NEXT:} @@ -507,7 +507,7 @@ func.func @gather(%operand: tensor<3x4x2xi32>, %start_indices: tensor<2x3x2xi64> collapsed_slice_dims = [0], start_index_map = [1, 0], index_vector_dim = 2>, - slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>, + slice_sizes = array, indices_are_sorted = false } : (tensor<3x4x2xi32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> return %result : tensor<2x3x2x2xi32> @@ -515,7 +515,7 @@ func.func @gather(%operand: tensor<3x4x2xi32>, %start_indices: tensor<2x3x2xi64> // CHECK: func.func private @gather(%arg0: tensor<3x4x2xi32>, %arg1: tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> { -// CHECK-NEXT: %0 = "stablehlo.gather"(%arg0, %arg1) {dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>} : (tensor<3x4x2xi32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> +// CHECK-NEXT: %0 = "stablehlo.gather"(%arg0, %arg1) {dimension_numbers = #stablehlo.gather, indices_are_sorted = false, slice_sizes = array} : 
(tensor<3x4x2xi32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> // CHECK-NEXT: return %0 : tensor<2x3x2x2xi32> // CHECK-NEXT:} diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo_const.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo_const.mlir index f3c64f67fc5f9b..e9708e0f14a877 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo_const.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/stablehlo_const.mlir @@ -8,9 +8,12 @@ module attributes {tfl.metadata = {"keep_stablehlo_constant" = "true"}} { } } -//CHECK:module attributes {tfl.description = "MLIR Converted.", tfl.metadata = {keep_stablehlo_constant = "true"}, tfl.schema_version = 3 : i32} { -//CHECK-NEXT: func.func @main() -> tensor<1x1x1x96xf32> attributes {tf.entry_function = {outputs = "stablehlo.constant"}} { -//CHECK-NEXT: %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x1x96xf32> -//CHECK-NEXT: return %0 : tensor<1x1x1x96xf32> -//CHECK-NEXT: } -//CHECK-NEXT:} \ No newline at end of file +// CHECK: module attributes { +// CHECK-SAME: tfl.metadata +// CHECK-SAME: keep_stablehlo_constant = "true" + +// CHECK-NEXT: func.func @main() -> tensor<1x1x1x96xf32> attributes {tf.entry_function = {outputs = "stablehlo.constant"}} { +// CHECK-NEXT: %0 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x1x96xf32> +// CHECK-NEXT: return %0 : tensor<1x1x1x96xf32> +// CHECK-NEXT: } +// CHECK-NEXT:} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/tf_variant_type.mlir b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/tf_variant_type.mlir index 5d1566cf121590..07738c1102f767 100644 --- a/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/tf_variant_type.mlir +++ b/tensorflow/compiler/mlir/lite/tests/flatbuffer2mlir/tf_variant_type.mlir @@ -4,6 +4,5 @@ func.func @main(%arg0 : tensor>>, %arg1: tensor>> } -// CHECK: module attributes {tfl.description = "MLIR Converted.", tfl.schema_version = 3 
: i32} // CHECK: func.func @main(%[[ARG0:.*]]: tensor>>, %[[ARG1:.*]]: tensor>>) -> tensor>> // CHECK-NEXT: return %[[ARG0]] : tensor>> diff --git a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir index 40db57950cc30f..46ff509b7cc46e 100644 --- a/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir +++ b/tensorflow/compiler/mlir/lite/tests/legalize-tf-no-runtime-verification.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s -tfl-prepare-tf -tfl-legalize-tf='run-tfl-runtime-verification=false' | FileCheck %s +// RUN: tf-opt %s -tfl-prepare-tf -tfl-legalize-tf='run-tfl-runtime-verification=false' -tfl-optimize | FileCheck %s func.func @broadcast_to_bf16(%arg0: tensor<3xbf16>, %arg1: tensor<2xi64>) -> tensor<3x3xbf16> { %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xbf16>, tensor<2xi64>) -> tensor<3x3xbf16> diff --git a/tensorflow/compiler/mlir/lite/tests/optimize.mlir b/tensorflow/compiler/mlir/lite/tests/optimize.mlir index ed5370b7cda3dc..9da0e13c0471ac 100644 --- a/tensorflow/compiler/mlir/lite/tests/optimize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/optimize.mlir @@ -377,6 +377,28 @@ func.func @DontFuseMulIntoFullyConnectedForLargeFilter(%arg0: tensor<128x256000x } +// CHECK-LABEL: @skipFuseMulIntoFullyConnected +func.func @skipFuseMulIntoFullyConnected(%arg0: tensor<4x2xf32>) -> (tensor<1x8xf32>, tensor<4x2xf32>) { + %cst0 = arith.constant dense<[[1.0, 2.0], [3.0, 4.0]]> : tensor<2x2xf32> + %cst1 = arith.constant dense<2.0> : tensor<2xf32> + %cst2 = arith.constant dense<[1.0, 2.0]> : tensor<2xf32> + %cst3 = arith.constant dense<[1, 8]> : tensor<2xi32> + + %0 = "tfl.fully_connected"(%arg0, %cst0, %cst1) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + %1 = "tfl.reshape"(%0, %cst3) : (tensor<4x2xf32>, tensor<2xi32>) -> 
tensor<1x8xf32> + %2 = "tfl.mul"(%0, %cst2) {fused_activation_function = "RELU6"} : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + + func.return %1, %2 : tensor<1x8xf32>, tensor<4x2xf32> + // CHECK: %cst = arith.constant dense<{{\[}}[1.000000e+00, 2.000000e+00], [3.000000e+00, 4.000000e+00]]> : tensor<2x2xf32> + // CHECK: %cst_0 = arith.constant dense<2.000000e+00> : tensor<2xf32> + // CHECK: %cst_1 = arith.constant dense<[1.000000e+00, 2.000000e+00]> : tensor<2xf32> + // CHECK: %cst_2 = arith.constant dense<[1, 8]> : tensor<2xi32> + // CHECK: %0 = "tfl.fully_connected"(%arg0, %cst, %cst_0) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<4x2xf32>, tensor<2x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + // CHECK: %1 = "tfl.reshape"(%0, %cst_2) : (tensor<4x2xf32>, tensor<2xi32>) -> tensor<1x8xf32> + // CHECK: %2 = tfl.mul(%0, %cst_1) {fused_activation_function = "RELU6"} : (tensor<4x2xf32>, tensor<2xf32>) -> tensor<4x2xf32> + // CHECK: return %1, %2 : tensor<1x8xf32>, tensor<4x2xf32> +} + // CHECK-LABEL: @fuseAddIntoFollowingFullyConnectedWithQDQs func.func @fuseAddIntoFollowingFullyConnectedWithQDQs(%arg0: tensor<4x2xf32>) -> tensor<4x2xf32> { %cst2 = arith.constant dense<1.5> : tensor @@ -762,26 +784,6 @@ func.func @FuseReshapeAroundBMMRHS(%arg0: tensor<1x3x6x5x1024xf32>) -> tensor<1x // CHECK: return %0 : tensor<1x3x6x5x8192xf32> } -// CHECK-LABEL: @FuseTransposeIntoBMM_RHS -func.func @FuseTransposeIntoBMM_RHS(%arg0: tensor<1x4x1440x256xf32>, %arg1: tensor<1x1440x256xf32>) -> tensor<1x4x1440x1440xf32> { - %cst_1 = arith.constant dense<[0, 2, 1]> : tensor<3xi32> - %32 = "tfl.transpose"(%arg1, %cst_1) : (tensor<1x1440x256xf32>, tensor<3xi32>) -> tensor<1x256x1440xf32> - %33 = "tfl.batch_matmul"(%arg0, %32) {adj_x = false, adj_y = false} : (tensor<1x4x1440x256xf32>, tensor<1x256x1440xf32>) -> tensor<1x4x1440x1440xf32> - return %33 : tensor<1x4x1440x1440xf32> - // CHECK: %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x 
= false, adj_y = true} : (tensor<1x4x1440x256xf32>, tensor<1x1440x256xf32>) -> tensor<1x4x1440x1440xf32> - // CHECK: return %0 : tensor<1x4x1440x1440xf32> -} - -// CHECK-LABEL: @FuseTransposeIntoBMM_RHS2 -func.func @FuseTransposeIntoBMM_RHS2(%arg0: tensor, %arg1: tensor) -> tensor { - %cst_1 = arith.constant dense<[0, 2, 1]> : tensor<3xi32> - %32 = "tfl.transpose"(%arg1, %cst_1) : (tensor, tensor<3xi32>) -> tensor - %33 = "tfl.batch_matmul"(%arg0, %32) {adj_x = false, adj_y = false} : (tensor, tensor) -> tensor - return %33 : tensor - // CHECK: %0 = "tfl.batch_matmul"(%arg0, %arg1) {adj_x = false, adj_y = true} : (tensor, tensor) -> tensor - // CHECK: return %0 : tensor -} - // CHECK-LABEL: @FuseTransposeIntoBMM_LHS func.func @FuseTransposeIntoBMM_LHS(%arg0: tensor<1x4x1440x256xf32>, %arg1: tensor<1x1440x256xf32>) -> tensor<1x4x256x256xf32> { %cst_1 = arith.constant dense<[0, 2, 1]> : tensor<3xi32> @@ -3897,3 +3899,163 @@ func.func @NoReorderNCHWTransposeAddNotBias(%arg0: tensor<1x40x40x1xf32>, %filte // CHECK: %[[add:.*]] = tfl.add %[[transpose]], // CHECK: return %[[add]] } + +// CHECK-LABEL: @ConvertStridedSliceToSlice +func.func @ConvertStridedSliceToSlice(%arg0: tensor<2x3872x1x128xf32>) -> tensor<1x3872x1x128xf32> { + %44 = arith.constant dense<0> : tensor<4xi32> + %45 = arith.constant dense<[1, 3872, 1, 128]> : tensor<4xi32> + %46 = arith.constant dense<1> : tensor<4xi32> + %47 = "tfl.strided_slice"(%arg0, %44, %45, %46) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<2x3872x1x128xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<1x3872x1x128xf32> + func.return %47 : tensor<1x3872x1x128xf32> + + // CHECK: %[[slice:.*]] = "tfl.slice" + // CHECK: return %[[slice]] +} + +// CHECK-LABEL: @FuseExcessBroadcastingOnReshapes +func.func @FuseExcessBroadcastingOnReshapes(%arg0: tensor<1x8xf32>) -> tensor<1x1x1x128xf32> { + %cst = arith.constant dense<[1, 1, 1, 
8, 1, 1]> : tensor<6xi32> + %cst_0 = arith.constant dense<[1, 1, 1, 8, 16, 1]> : tensor<6xi32> + %cst_1 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> + %0 = "tfl.reshape"(%arg0, %cst) : (tensor<1x8xf32>, tensor<6xi32>) -> tensor<1x1x1x8x1x1xf32> + %1 = "tfl.broadcast_to"(%0, %cst_0) : (tensor<1x1x1x8x1x1xf32>, tensor<6xi32>) -> tensor<1x1x1x8x16x1xf32> + %2 = "tfl.reshape"(%1, %cst_1) : (tensor<1x1x1x8x16x1xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> + return %2 : tensor<1x1x1x128xf32> + // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<8x16xf32> + // CHECK: %cst_0 = arith.constant dense<[1, 1, 1, 128]> : tensor<4xi32> + // CHECK: %cst_1 = arith.constant dense<[8, 1]> : tensor<2xi32> + // CHECK: %0 = "tfl.reshape"(%arg0, %cst_1) : (tensor<1x8xf32>, tensor<2xi32>) -> tensor<8x1xf32> + // CHECK: %1 = tfl.mul(%0, %cst) {fused_activation_function = "NONE"} : (tensor<8x1xf32>, tensor<8x16xf32>) -> tensor<8x16xf32> + // CHECK: %2 = "tfl.reshape"(%1, %cst_0) : (tensor<8x16xf32>, tensor<4xi32>) -> tensor<1x1x1x128xf32> + // CHECK: return %2 : tensor<1x1x1x128xf32> +} + +// CHECK-LABEL: @FuseExcessBroadcastingOnReshapesDynamicShapes +func.func @FuseExcessBroadcastingOnReshapesDynamicShapes(%arg0: tensor, %arg1: tensor<6xi32>, %arg2: tensor<6xi32>, %arg3: tensor<2xi32>) -> tensor { + %1196 = "tfl.reshape"(%arg0, %arg1) : (tensor, tensor<6xi32>) -> tensor<1x?x1x10x1x1xf32> + %1197 = "tfl.broadcast_to"(%1196, %arg2) : (tensor<1x?x1x10x1x1xf32>, tensor<6xi32>) -> tensor<1x?x1x10x5x1xf32> + %1198 = "tfl.reshape"(%1197, %arg3) : (tensor<1x?x1x10x5x1xf32>, tensor<2xi32>) -> tensor + return %1198 : tensor + + // CHECK: %0 = "tfl.reshape"(%arg0, %arg1) : (tensor, tensor<6xi32>) -> tensor<1x?x1x10x1x1xf32> + // CHECK: %1 = "tfl.broadcast_to"(%0, %arg2) : (tensor<1x?x1x10x1x1xf32>, tensor<6xi32>) -> tensor<1x?x1x10x5x1xf32> + // CHECK: %2 = "tfl.reshape"(%1, %arg3) : (tensor<1x?x1x10x5x1xf32>, tensor<2xi32>) -> tensor + // CHECK: return %2 : tensor +} + +// 
CHECK-LABEL: @broadcast_to_f32_low_dim +func.func @broadcast_to_f32_low_dim(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> + // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: return %0 : tensor<3x3xf32> +} + +// CHECK-LABEL: @broadcast_to_i32_low_dim +func.func @broadcast_to_i32_low_dim(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x3xi32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> + // CHECK: return %0 : tensor<3x3xi32> +} + +// CHECK-LABEL: @broadcast_to_low_dim_with_unknown_shape +func.func @broadcast_to_low_dim_with_unknown_shape(%arg0: tensor<3xf32>, %arg1: tensor<*xi32>) -> tensor<3x3xf32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<*xi32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> + // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: return %0 : tensor<3x3xf32> +} + +// CHECK-LABEL: @broadcast_to_i16_low_dim +func.func @broadcast_to_i16_low_dim(%arg0: tensor<3xi16>, %arg1: tensor<2xi32>) -> tensor<3x3xi16> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi16>, tensor<2xi32>) -> tensor<3x3xi16> + return %0 : tensor<3x3xi16> + // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi16> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xi16>, tensor<3x3xi16>) -> 
tensor<3x3xi16> + // CHECK: return %0 : tensor<3x3xi16> +} + +// CHECK-LABEL: @broadcast_to_i32_low_dim_with_unknown_output +func.func @broadcast_to_i32_low_dim_with_unknown_output(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<*xi32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<*xi32> + return %0 : tensor<*xi32> + // CHECK: %cst = arith.constant dense<1> : tensor + // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<*xi32> + // CHECK: %1 = tfl.mul(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<*xi32>) -> tensor<*xi32> + // CHECK: return %1 : tensor<*xi32> +} + +// CHECK-LABEL: @broadcast_to_ui32 +func.func @broadcast_to_ui32(%arg0: tensor, %arg1: tensor<1xi64>) -> tensor<10xui32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor, tensor<1xi64>) -> tensor<10xui32> + return %0 : tensor<10xui32> + // CHECK: %cst = arith.constant dense<1> : tensor<10xui32> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor, tensor<10xui32>) -> tensor<10xui32> + // CHECK: return %0 : tensor<10xui32> +} + +// CHECK-LABEL: @broadcast_to_f32 +func.func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> + // CHECK: %cst = arith.constant dense<1.000000e+00> : tensor<3x3xf32> + // CHECK: %0 = tfl.mul(%arg0, %cst) {fused_activation_function = "NONE"} : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + // CHECK: return %0 : tensor<3x3xf32> +} + +// CHECK-LABEL: @broadcast_to_i32 +func.func @broadcast_to_i32(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x3xi32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> + return %0 : tensor<3x3xi32> + // CHECK: %cst = arith.constant dense<1> : tensor<3x3xi32> + // CHECK: %0 = tfl.mul(%arg0, %cst) 
{fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> + // CHECK: return %0 : tensor<3x3xi32> +} + +// CHECK-LABEL: @broadcast_to_i32_with_dynamic_shape_and_output +func.func @broadcast_to_i32_with_dynamic_shape_and_output(%arg0: tensor<3xi32>, %arg1: tensor<2xi32>) -> tensor<3x?xi32> { + %0 = "tfl.broadcast_to"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x?xi32> + return %0 : tensor<3x?xi32> + // CHECK: %cst = arith.constant dense<1> : tensor + // CHECK: %0 = "tfl.fill"(%arg1, %cst) : (tensor<2xi32>, tensor) -> tensor<3x?xi32> + // CHECK: %1 = tfl.mul(%arg0, %0) {fused_activation_function = "NONE"} : (tensor<3xi32>, tensor<3x?xi32>) -> tensor<3x?xi32> + // CHECK: return %1 : tensor<3x?xi32> +} + +// CHECK-LABEL: @broadcast_to_ui32_with_dynamic_output +func.func @broadcast_to_ui32_with_dynamic_output(%arg0: tensor<1xi32>) -> tensor { + %cst = arith.constant dense<0> : tensor<1xui32> + %0 = "tfl.broadcast_to"(%cst, %arg0) : (tensor<1xui32>, tensor<1xi32>) -> tensor + return %0 : tensor + + // CHECK: %cst = arith.constant dense<0> : tensor<1xui32> + // CHECK: %0 = "tfl.broadcast_to"(%cst, %arg0) : (tensor<1xui32>, tensor<1xi32>) -> tensor + // CHECK: return %0 : tensor +} + + +// CHECK-LABEL: @ConvertStridedSliceToSliceNeg +func.func @ConvertStridedSliceToSliceNeg(%arg0: tensor<5x5x5x5xf32>) -> tensor<*xf32> { + %44 = arith.constant dense<[5, 5, 5, 5]> : tensor<4xi32> + %45 = arith.constant dense<[1, 1, 1, 1]> : tensor<4xi32> + %46 = arith.constant dense<1> : tensor<4xi32> + %47 = "tfl.strided_slice"(%arg0, %44, %45, %46) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<5x5x5x5xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<*xf32> + func.return %47 : tensor<*xf32> + + // CHECK-NOT: %[[slice:.*]] = "tfl.slice" +} + +// CHECK-LABEL: @StridedSliceToSliceBeginNeg +func.func @StridedSliceToSliceBeginNeg(%arg0: 
tensor<5x5x5x5xf32>) -> tensor<*xf32> { + %44 = arith.constant dense<[-5, 0, 0, 0]> : tensor<4xi32> + %45 = arith.constant dense<[1, 1, 1, 1]> : tensor<4xi32> + %46 = arith.constant dense<1> : tensor<4xi32> + %47 = "tfl.strided_slice"(%arg0, %44, %45, %46) {begin_mask = 0 : i32, ellipsis_mask = 0 : i32, end_mask = 0 : i32, new_axis_mask = 0 : i32, offset = false, shrink_axis_mask = 0 : i32} : (tensor<5x5x5x5xf32>, tensor<4xi32>, tensor<4xi32>, tensor<4xi32>) -> tensor<*xf32> + func.return %47 : tensor<*xf32> + + // CHECK-NOT: %[[slice:.*]] = "tfl.slice" +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir index 0c9f058c1912c9..b35355524127dc 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-dynamic-range.mlir @@ -103,8 +103,8 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 func.return %fc : tensor<1x112x112x512xf32> // CHECK-DAG: %[[w:.*]] = arith.constant dense<1.270000e+02> : tensor<512x12xf32> -// CHECK-DAG: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>>} -// CHECK-DAG: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<512x12xf32> +// CHECK-DAG: %[[q_w:.*]] = "tfl.quantize"(%[[w]]) {qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, +// CHECK-DAG: %[[dq_w:.*]] = "tfl.dequantize"(%[[q_w]]) : (tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00, // CHECK-DAG: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) { // CHECK-NOT: fused_activation_function = "NONE" diff --git 
a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir index d2e04734e0e2e6..15ede0019e12d6 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-post-training-16bits.mlir @@ -298,8 +298,8 @@ func.func @QuantizeFullyConnectedOp(%arg0: tensor<1x3xf32>) -> (tensor<1x1xf32>) // CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) {qtype = tensor<1x!quant.uniform>, volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform> // CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x!quant.uniform>) -> tensor<1xf32> // CHECK-NEXT: %[[cst_0:.*]] = arith.constant dense<{{.*}}> : tensor<1x3xf32> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32, {{.*}}>> -// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3xf32> +// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>> +// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32:0, {{.*}}>>) -> tensor<1x3xf32> // CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x3x!quant.uniform>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> // CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q3]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> // CHECK-NEXT: %[[fc:.*]] = "tfl.fully_connected"(%[[dq3]], %[[dq2]], %[[dq1]]) {{{.*}}} : (tensor<1x3xf32>, tensor<1x3xf32>, tensor<1xf32>) -> tensor<1x1xf32> @@ -324,8 +324,8 @@ func.func @QuantizeReshapeAndFullyConnectedOp(%arg0: tensor<1x1x3xf32>) -> (tens // CHECK-NEXT: %[[q1:.*]] = "tfl.quantize"(%[[cst]]) {qtype = 
tensor<1x!quant.uniform>, volatile} : (tensor<1xf32>) -> tensor<1x!quant.uniform> // CHECK-NEXT: %[[dq1:.*]] = "tfl.dequantize"(%[[q1]]) : (tensor<1x!quant.uniform>) -> tensor<1xf32> // CHECK-NEXT: %[[cst_0:.*]] = arith.constant dense<{{.*}}> : tensor<1x3xf32> -// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32, {{.*}}>> -// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3xf32> +// CHECK-NEXT: %[[q2:.*]] = "tfl.quantize"(%[[cst_0]]) {qtype = tensor<1x3x!quant.uniform:f32:0, {{.*}}>>, volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform:f32:0, {{.*}}>> +// CHECK-NEXT: %[[dq2:.*]] = "tfl.dequantize"(%[[q2]]) : (tensor<1x3x!quant.uniform:f32:0, {{.*}}>>) -> tensor<1x3xf32> // CHECK-NEXT: %[[cst_1:.*]] = arith.constant dense<[-1, 3]> : tensor<2xi32> // CHECK-NEXT: %[[q3:.*]] = "tfl.quantize"(%arg0) {qtype = tensor<1x1x3x!quant.uniform>, volatile} : (tensor<1x1x3xf32>) -> tensor<1x1x3x!quant.uniform> // CHECK-NEXT: %[[dq3:.*]] = "tfl.dequantize"(%[[q3]]) : (tensor<1x1x3x!quant.uniform>) -> tensor<1x1x3xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir index 6e9ca99e11f492..882b335135cf74 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize-signed.mlir @@ -166,20 +166,20 @@ func.func @prepareDepthwiseConv2D(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 // CHECK-LABEL: QuantizeFullyConnected // PerTensor-LABEL: QuantizeFullyConnected -func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x32xf32> { - %w = arith.constant dense<127.0> : tensor<32x12xf32> - %b = arith.constant dense<0.0> : tensor<32xf32> - %fc = "tfl.fully_connected"(%arg0, %w, %b) {fused_activation_function = 
"NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x224x224x3xf32>, tensor<32x12xf32>, tensor<32xf32>) -> tensor<1x112x112x32xf32> - func.return %fc : tensor<1x112x112x32xf32> - -// CHECK: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<32x12xf32> -// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} -// CHECK: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> +func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x112x112x4xf32> { + %w = arith.constant dense<127.0> : tensor<4x12xf32> + %b = arith.constant dense<0.0> : tensor<4xf32> + %fc = "tfl.fully_connected"(%arg0, %w, %b) {fused_activation_function = "NONE", keep_num_dims = false, weights_format = "DEFAULT"} : (tensor<1x224x224x3xf32>, tensor<4x12xf32>, tensor<4xf32>) -> tensor<1x112x112x4xf32> + func.return %fc : tensor<1x112x112x4xf32> + +// CHECK: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<4x12xf32> +// CHECK: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<4x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00}>>, volatile} +// CHECK: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<4x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00}>>) -> tensor<4x12xf32> // CHECK: "tfl.fully_connected"(%arg0, %[[dq]] -// PerTensor: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<32x12xf32> -// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<32x12x!quant.uniform:f32, 1.000000e+00>>, volatile} -// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : (tensor<32x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<32x12xf32> +// PerTensor: %[[cst:.*]] = arith.constant dense<1.270000e+02> : tensor<4x12xf32> +// PerTensor: %[[q:.*]] = "tfl.quantize"(%cst) {qtype = tensor<4x12x!quant.uniform:f32, 1.000000e+00>>, volatile} +// PerTensor: %[[dq:.*]] = "tfl.dequantize"(%0) : 
(tensor<4x12x!quant.uniform:f32, 1.000000e+00>>) -> tensor<4x12xf32> // PerTensor: "tfl.fully_connected"(%arg0, %[[dq]] } @@ -215,8 +215,8 @@ func.func @bias_adjust_pertensor(%arg0: tensor<1x2xf32>) -> (tensor<1x2xf32>) { func.return %fc : tensor<1x2xf32> // CHECK-DAG: %[[weight:.*]] = arith.constant dense<{{\[\[}}0.000000e+00, 1.000000e+00] // CHECK-DAG: %[[bias:.*]] = arith.constant dense<[0.000000e+00, 2147364.75]> -// CHECK-DAG: %[[b_q:.*]] = "tfl.quantize"(%[[bias]]){{.*}}quant.uniform> -// CHECK-DAG: %[[w_q:.*]] = "tfl.quantize"(%[[weight]]){{.*}}quant.uniform:f32, 19998.892343977564>> +// CHECK-DAG: %[[b_q:.*]] = "tfl.quantize"(%[[bias]]){{.*}}quant.uniform> +// CHECK-DAG: %[[w_q:.*]] = "tfl.quantize"(%[[weight]]){{.*}}quant.uniform:f32:0, {0.0078740157480314959,19998.892343977564}>> // CHECK-DAG: %[[b_dq:.*]] = "tfl.dequantize"(%[[b_q]]) // CHECK-DAG: %[[w_dq:.*]] = "tfl.dequantize"(%[[w_q]]) // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%[[input:.*]], %[[w_dq]], %[[b_dq]]) diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir index 2a4b2af88f5319..cce986eb8f1a8e 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-quantize.mlir @@ -1,5 +1,6 @@ // RUN: tf-opt %s -tfl-prepare-quantize="quantize-allowlist=quantize_float_placeholder_only,not_reset_input" | FileCheck %s // RUN: tf-opt %s -tfl-prepare-quantize="disable-set-input-nodes-quantization-params=true" | FileCheck --check-prefix=MixedPrecision %s +// RUN: tf-opt %s -tfl-prepare-quantize="is-qdq-conversion=true" | FileCheck --check-prefix=QDQ %s // CHECK-LABEL: main // Uses `main` function to match the default target function of QuantSpecs and @@ -394,6 +395,32 @@ func.func @NotRescaleLogistic(%arg0: tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : 
(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + func.return %1 : tensor<1x6x6x16xf32> + +// QDQ: %0 = "tfl.dequantize"(%arg0) +// QDQ: %1 = "tfl.logistic"(%0) : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> +// QDQ-NOT:"tfl.quantize" +// QDQ: return %1 : tensor<1x6x6x16xf32> +} + +// QDQ-LABEL: QDQNoQuantizeSoftmax +func.func @QDQNoQuantizeSoftmax(tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { +^bb0(%arg0: tensor<1x6x6x16x!quant.uniform>): + %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> + %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> + func.return %1 : tensor<1x6x6x16xf32> + +// QDQ: %0 = "tfl.dequantize"(%arg0) +// QDQ: %1 = "tfl.softmax"(%0) {beta = 1.000000e+00 : f32} : (tensor<1x6x6x16xf32>) -> tensor<1x6x6x16xf32> +// QDQ-NOT: "tfl.quantize" +// QDQ: return %1 : tensor<1x6x6x16xf32> +} + // CHECK-LABEL: QuantizeL2Norm func.func @QuantizeL2Norm(%arg0: tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> { %0 = "tfl.dequantize"(%arg0) : (tensor<1x6x6x16x!quant.uniform>) -> tensor<1x6x6x16xf32> diff --git a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir index 0769e768507ee7..bfbcbd573cb0e0 100644 --- a/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir +++ b/tensorflow/compiler/mlir/lite/tests/prepare-tf.mlir @@ -440,34 +440,13 @@ func.func @StridedSliceShrinkAxisAndNewAxisMaskBothSet(%arg0: tensor<6x7x8xf32>) // CHECK: %[[STRIDED_SLICE:.*]] = "tf.StridedSlice"(%[[RESHAPE]], %[[BEGIN]], %[[END]], %[[STEP]]) <{begin_mask = 26 : i64, ellipsis_mask = 0 : i64, end_mask = 26 : i64, new_axis_mask = 0 : i64, shrink_axis_mask = 1 : i64}> : (tensor<6x1x7x1x8xf32>, tensor<5xi32>, tensor<5xi32>, tensor<5xi32>) -> tensor<1x4x1x8xf32> } -func.func @broadcast_to_f32_low_dim(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> 
tensor<3x3xf32> { - %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> - func.return %0: tensor<3x3xf32> - -// CHECK-LABEL: broadcast_to_f32_low_dim -// CHECK: [[CST:%.*]] = arith.constant dense<1.000000e+00> : tensor<3x3xf32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> -// CHECK: return [[MUL]] : tensor<3x3xf32> -} - -func.func @broadcast_to_i32_low_dim(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3x3xi32> { - %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> - func.return %0: tensor<3x3xi32> - -// CHECK-LABEL: broadcast_to_i32_low_dim -// CHECK: [[CST:%.*]] = arith.constant dense<1> : tensor<3x3xi32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> -// CHECK: return [[MUL]] : tensor<3x3xi32> -} - func.func @broadcast_to_i16_low_dim(%input: tensor<3xi16>, %shape: tensor<2xi32>) -> tensor<3x3xi16> { %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi16>, tensor<2xi32>) -> tensor<3x3xi16> func.return %0: tensor<3x3xi16> // CHECK-LABEL: broadcast_to_i16_low_dim -// CHECK: [[CST:%.*]] = arith.constant dense<1> : tensor<3x3xi16> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xi16>, tensor<3x3xi16>) -> tensor<3x3xi16> -// CHECK: return [[MUL]] : tensor<3x3xi16> +// CHECK: %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xi16>, tensor<2xi32>) -> tensor<3x3xi16> +// CHECK: return %0 : tensor<3x3xi16> } func.func @broadcast_to_low_dim_with_unknown_shape(%arg0: tensor<3xf32>, %arg1: tensor<*xi32>) -> tensor<3x3xf32> { @@ -475,9 +454,8 @@ func.func @broadcast_to_low_dim_with_unknown_shape(%arg0: tensor<3xf32>, %arg1: func.return %0: tensor<3x3xf32> // CHECK-LABEL: broadcast_to_low_dim_with_unknown_shape -// CHECK: [[CST:%.*]] = arith.constant dense<1.000000e+00> : tensor<3x3xf32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xf32>, tensor<3x3xf32>) 
-> tensor<3x3xf32> -// CHECK: return [[MUL]] : tensor<3x3xf32> +// CHECK: %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<*xi32>) -> tensor<3x3xf32> +// CHECK: return %0 : tensor<3x3xf32> } func.func @broadcast_to_i32_low_dim_with_unknown_output(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<*xi32> { @@ -485,10 +463,8 @@ func.func @broadcast_to_i32_low_dim_with_unknown_output(%input: tensor<3xi32>, % func.return %0: tensor<*xi32> // CHECK-LABEL: broadcast_to_i32_low_dim_with_unknown_output -// CHECK: [[CST:%.*]] = arith.constant dense<1> : tensor -// CHECK: [[FILL:%.*]] = "tf.Fill"(%arg1, [[CST]]) : (tensor<2xi32>, tensor) -> tensor<*xi32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[FILL]]) : (tensor<3xi32>, tensor<*xi32>) -> tensor<*xi32> -// CHECK: return [[MUL]] : tensor<*xi32> +// CHECK: %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xi32>, tensor<2xi32>) -> tensor<*xi32> +// CHECK: return %0 : tensor<*xi32> } func.func @broadcast_to_high_dim_with_unknown_shape(%arg0: tensor<1x2x3x4x5x6xf32>, %arg1: tensor<*xi32>) -> tensor<7x8x1x2x3x4x5x6xf32> { @@ -517,16 +493,6 @@ func.func @broadcast_to_with_unknown_shape_and_output(%arg0: tensor<1x2x3x4x5x6x // CHECK: "tf.BroadcastTo"(%arg0, %arg1) } -func.func @broadcast_to_ui32(%arg0: tensor, %arg1: tensor<1xi64>) -> tensor<10xui32> { - %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor, tensor<1xi64>) -> tensor<10xui32> - func.return %0: tensor<10xui32> - -// CHECK-LABEL: broadcast_to_ui32 -// CHECK: [[CST:%.*]] = arith.constant dense<1> : tensor<10xui32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor, tensor<10xui32>) -> tensor<10xui32> -// CHECK: return [[MUL]] : tensor<10xui32> -} - // CHECK-LABEL: xla_conv_v2 func.func @xla_conv_v2(%arg0: tensor<4x8x8x16xf32>) -> tensor<4x8x8x16xf32> { %0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<3x3x16x16xf32>} : () -> tensor<3x3x16x16xf32> loc("Const_1") @@ -541,26 +507,6 @@ func.func @xla_conv_v2(%arg0: tensor<4x8x8x16xf32>) -> 
tensor<4x8x8x16xf32> { // CHECK: return %[[RES]] } -func.func @broadcast_to_f32(%arg0: tensor<3xf32>, %arg1: tensor<2xi32>) -> tensor<3x3xf32> { - %0 = "tf.BroadcastTo"(%arg0, %arg1) : (tensor<3xf32>, tensor<2xi32>) -> tensor<3x3xf32> - func.return %0: tensor<3x3xf32> - -// CHECK-LABEL: broadcast_to_f32 -// CHECK: [[CST:%.*]] = arith.constant dense<1.000000e+00> : tensor<3x3xf32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> -// CHECK: return [[MUL]] : tensor<3x3xf32> -} - -func.func @broadcast_to_i32(%input: tensor<3xi32>, %shape: tensor<2xi32>) -> tensor<3x3xi32> { - %0 = "tf.BroadcastTo"(%input, %shape) : (tensor<3xi32>, tensor<2xi32>) -> tensor<3x3xi32> - func.return %0: tensor<3x3xi32> - -// CHECK-LABEL: broadcast_to_i32 -// CHECK: [[CST:%.*]] = arith.constant dense<1> : tensor<3x3xi32> -// CHECK: [[MUL:%.*]] = "tf.Mul"(%arg0, [[CST]]) : (tensor<3xi32>, tensor<3x3xi32>) -> tensor<3x3xi32> -// CHECK: return [[MUL]] : tensor<3x3xi32> -} - // CHECK-LABEL: lower_rfft_to_rfft2d func.func @lower_rfft_to_rfft2d(%input: tensor<10x20x30xf32>, %fft_len: tensor<1xi32>) -> tensor<10x20x30xcomplex> { %0 = "tf.RFFT"(%input, %fft_len) : (tensor<10x20x30xf32>, tensor<1xi32>) -> tensor<10x20x30xcomplex> diff --git a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir index 15684bc4bd2204..ad4ff5a129f4a2 100644 --- a/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir +++ b/tensorflow/compiler/mlir/lite/tests/quantize-dynamic-range.mlir @@ -88,7 +88,7 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 func.return %fc : tensor<1x112x112x512xf32> // CHECK: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>> +// CHECK: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = 
tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 // CHECK: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[w]], %[[b]]) { // CHECK-NOT: fused_activation_function = "NONE", // CHECK-SAME: asymmetric_quantize_inputs = true, @@ -102,8 +102,8 @@ func.func @QuantizeFullyConnected(%arg0: tensor<1x224x224x3xf32>) -> tensor<1x11 // PerTensor: return %[[fc:.*]] // PerChannelWeightOnly: %[[b:.*]] = arith.constant dense<0.000000e+00> : tensor<512xf32> -// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32, 1.000000e+00>> -// PerChannelWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<512x12x!quant.uniform:f32, 1.000000e+00>> +// PerChannelWeightOnly: %[[w:.*]] = "tfl.pseudo_qconst"() {qtype = tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 +// PerChannelWeightOnly: %[[dq_w:.*]] = "tfl.dequantize"(%[[w]]) : (tensor<512x12x!quant.uniform:f32:0, {1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00 // PerChannelWeightOnly: %[[fc:.*]] = "tfl.fully_connected"(%arg0, %[[dq_w]], %[[b]]) { // PerChannelWeightOnly-NOT: fused_activation_function = "NONE", // PerChannelWeightOnly-SAME: asymmetric_quantize_inputs = true, diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc index 6da1d1e546ea32..749e5f383b289b 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.cc @@ -19,21 +19,22 @@ limitations under the License. 
#include #include +#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project +#include "stablehlo/experimental/transforms/Passes.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_passes.h" #include "tensorflow/compiler/mlir/lite/quantization/tensorflow/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/fake_quant_utils.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" @@ -47,7 +48,7 @@ CreateTFExecutorToControlDialectConversion(); namespace tensorflow { namespace { // Data layout supported by TFLite. 
-const char kTFLiteDataLayout[] = "NHWC"; +constexpr mlir::StringRef kTFLiteDataLayout = "NHWC"; } // namespace void AddQuantizationPasses(const mlir::TFL::PassConfig& pass_config, @@ -136,12 +137,26 @@ void AddDynamicRangeQuantizationPasses(const mlir::TFL::PassConfig& pass_config, mlir::TFL::CreateOptimizePass(/*enable_canonicalization=*/true)); } -void AddConvertHloToTfPass(std::string entry_function_name, - const mlir::TFL::PassConfig& pass_config, - mlir::OpPassManager* pass_manager) { - pass_manager->addPass( +void AddPreQuantizationStableHloToTfPasses( + const mlir::StringRef entry_function_name, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager) { + pass_manager.addPass( mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); + // Add CHLO to StableHLO Decompositions: + // This is needed since we are relying on XlaCallModule uses MHLO + // specific features like mhlo::ErfOp which aren't supported + // in StableHLO, but we have CHLO->StableHLO decompositions to legalize. + pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + pass_manager.addPass( + mlir::stablehlo::experimental::createChloRecomposeOpsPass()); + pass_manager.addNestedPass( + mlir::mhlo::createChloLegalizeToHloBasisOpsPass()); + pass_manager.addNestedPass( + mlir::mhlo::createChloLegalizeToHloPass()); + pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + // The following two passes find specific uniform quantization patterns in // StableHLO and converts them to TFLite ops that accept or produce uniform // quantized types. They only target a specific set of models that contain @@ -153,65 +168,93 @@ void AddConvertHloToTfPass(std::string entry_function_name, // There are future plans to make the framework to directly produce StableHLO // uniform quantized ops and deprecate `ComposeUniformQuantizedTypePass`. If // no quantization patterns are found, it is a no-op. 
- pass_manager->addPass(mlir::odml::CreateComposeUniformQuantizedTypePass()); - pass_manager->addNestedPass( - mlir::odml::CreateUniformQuantizedStablehloToTflPass()); + pass_manager.addPass(mlir::odml::CreateComposeUniformQuantizedTypePass()); + pass_manager.addNestedPass( + mlir::odml::CreateUniformQuantizedStableHloToTflPass()); - pass_manager->addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); + pass_manager.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); // Legalize jax random to tflite custom op. // The CreateLegalizeJaxRandom Pass has to stay at because we need to replace // the random function body before being inlined. - pass_manager->addNestedPass( + pass_manager.addNestedPass( mlir::TFL::CreateLegalizeJaxRandomPass()); // Canonicalize, CSE etc. - pass_manager->addNestedPass( + pass_manager.addNestedPass( mlir::createCanonicalizerPass()); - pass_manager->addNestedPass(mlir::createCSEPass()); + pass_manager.addNestedPass(mlir::createCSEPass()); // DCE for private symbols. - pass_manager->addPass(mlir::createSymbolDCEPass()); + pass_manager.addPass(mlir::createSymbolDCEPass()); - pass_manager->addPass(mlir::TF::CreateStripNoinlineAttributePass()); + pass_manager.addPass(mlir::TF::CreateStripNoinlineAttributePass()); // Add inline pass. - pass_manager->addPass(mlir::createInlinerPass()); + pass_manager.addPass(mlir::createInlinerPass()); // Expands mhlo.tuple ops. - pass_manager->addPass( - mlir::mhlo::createExpandHloTuplesPass(entry_function_name)); + pass_manager.addPass( + mlir::mhlo::createExpandHloTuplesPass(entry_function_name.str())); // Flatten tuples for control flows. - pass_manager->addNestedPass( + pass_manager.addNestedPass( mlir::mhlo::createFlattenTuplePass()); - mlir::odml::AddMhloOptimizationPasses(*pass_manager); + mlir::odml::AddMhloOptimizationPasses(pass_manager, + pass_config.enable_stablehlo_quantizer); // Undo the MHLO::BroadcastInDimOp folding pattern on splat constants. 
This // pass must be added right before the legalization because pattern rewriter // driver applies folding by default. - // TODO(b/295966255): Remove this pass after moving MHLO folders to a separate - // pass. - pass_manager->addPass(mlir::odml::CreateUnfoldSplatConstantPass()); + // TODO: b/295966255 - Remove this pass after moving MHLO folders to a + // separate pass. + pass_manager.addPass(mlir::odml::CreateUnfoldSplatConstantPass()); + + if (pass_config.enable_stablehlo_quantizer) { + // When using StableHLO Quantizer, MHLO ops should be transformed back into + // StableHLO because the quantizer takes StableHLO dialect as its input. + pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + } +} + +void AddPostQuantizationStableHloToTfPasses( + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager) { + if (pass_config.enable_stablehlo_quantizer) { + // StableHLO Quantizer emits quantized StableHLO module serialized within a + // XlaCallModule op. Add this pass to extract StableHLO module from the + // XlaCallModuleOp. + pass_manager.addPass( + mlir::odml::CreateLegalizeTFXlaCallModuleToStablehloPass()); + + // Convert StableHLO -> TFLite for fused quantization patterns early so that + // quantized types do not go through the TF dialect which doesn't support + // quantized types. + pass_manager.addNestedPass( + mlir::odml::CreateUniformQuantizedStableHloToTflPass()); + + // StableHLO -> MHLO + pass_manager.addPass(mlir::mhlo::createStablehloLegalizeToHloPass()); + } // TFLite dialect passes. if (!pass_config.disable_hlo_to_tfl_conversion) { - pass_manager->addPass(mlir::odml::CreateLegalizeHloToTfLitePass()); + pass_manager.addPass(mlir::odml::CreateLegalizeHloToTfLitePass()); } // TF dialect passes - pass_manager->addPass(mlir::odml::CreateLegalizeHloToTfPass()); + pass_manager.addPass(mlir::odml::CreateLegalizeHloToTfPass()); // folds tf.BroadcastTo ops with subsequent ops if they have built in // broadcasting support. 
This needs to be run immediately after HLO->TF // legalization; otherwise other passes like `ConvertTFBroadcastTo` will // constant fold the newly generated TF broadcast ops and materialize the // weights. - pass_manager->addNestedPass( + pass_manager.addNestedPass( mlir::TF::CreateBroadcastFoldPass()); // Canonicalization after TF legalization. - pass_manager->addNestedPass( + pass_manager.addNestedPass( mlir::createCanonicalizerPass()); // Legalize all remaining mhlo ops to stableHLO - pass_manager->addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); + pass_manager.addPass(mlir::mhlo::createHloLegalizeToStablehloPass()); } // This is the early part of the conversion in isolation. This enables a caller @@ -220,11 +263,6 @@ void AddConvertHloToTfPass(std::string entry_function_name, void AddPreVariableFreezingTFToTFLConversionPasses( const mlir::TFL::PassConfig& pass_config, mlir::OpPassManager* pass_manager) { - if (pass_config.enable_hlo_to_tf_conversion) { - // TODO(b/194747383): We need to valid that indeed the "main" func is - // presented. - AddConvertHloToTfPass("main", pass_config, pass_manager); - } // This pass wraps all the tf.FakeQuant ops in a custom op so they are not // folded before being converted to tfl.quantize and tfl.dequantize ops. auto wrapped_ops = mlir::TFL::AllTfFakeQuantOps(); @@ -266,7 +304,7 @@ void AddPreVariableFreezingTFToTFLConversionPasses( // This decomposes resource ops like ResourceGather into read-variable op // followed by gather. This is used when the saved model import path is used - // during which resources dont get frozen in the python layer. + // during which resources don't get frozen in the python layer. 
pass_manager->addNestedPass( mlir::TFDevice::CreateDecomposeResourceOpsPass()); @@ -375,7 +413,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( // Force layout supported by TFLite, this will transpose the data // to match 'kTFLiteDataLayout' mlir::TF::LayoutOptimizationPipelineOptions layout_optimization_options; - layout_optimization_options.force_data_format = kTFLiteDataLayout; + layout_optimization_options.force_data_format = kTFLiteDataLayout.str(); layout_optimization_options.skip_fold_transpose_in_ops = true; mlir::TF::CreateLayoutOptimizationPipeline( pass_manager->nest(), layout_optimization_options); @@ -436,7 +474,7 @@ void AddPostVariableFreezingTFToTFLConversionPasses( // completed. Add either full integer quantization or dynamic range // quantization passes based on quant_specs. if (pass_config.quant_specs.RunPropagationAndRewriteQuantizationPasses() || - pass_config.qdq_conversion_mode != + pass_config.quant_specs.qdq_conversion_mode != mlir::quant::QDQConversionMode::kQDQNone) { AddQuantizationPasses(pass_config, *pass_manager); // Remove unnecessary QDQs while handling QAT models. diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_passes.h b/tensorflow/compiler/mlir/lite/tf_tfl_passes.h index 50bf75023a808a..8de2142f0ebd83 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_passes.h +++ b/tensorflow/compiler/mlir/lite/tf_tfl_passes.h @@ -17,6 +17,7 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_LITE_TF_TFL_PASSES_H_ #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/lite/toco/toco_flags.pb.h" @@ -32,6 +33,25 @@ void AddTFToTFLConversionPasses(llvm::StringRef saved_model_dir, const mlir::TFL::PassConfig& pass_config, mlir::OpPassManager* pass_manager); +// Adds the first portion of StableHLO->TF passes happening before quantization. 
+// The `pass_manager` that runs on a `mlir::ModuleOp` expects a graph containing +// a `mlir::TF::XlaCallModuleOp` with serialized StableHLO module. The resulting +// `mlir::ModuleOp` after running these passes will be an MHLO module, or a +// StableHLO module if `pass_config.enable_stablehlo_quantizer` is `true`. This +// is because StableHLO Quantizer accepts StableHLO modules. +void AddPreQuantizationStableHloToTfPasses( + mlir::StringRef entry_function_name, + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); + +// Adds the second portion of StableHlo->TF passes happening after quantization. +// The input module is expected to be an MHLO module, or a quantized StableHLO +// graph (expressed as `mlir::TF::XlaCallModuleOp`s) if +// `pass_config.enable_stablehlo_quantizer` is `true`. +void AddPostQuantizationStableHloToTfPasses( + const mlir::TFL::PassConfig& pass_config, + mlir::OpPassManager& pass_manager); + // This is the early part of the conversion in isolation. This enables a caller // to inject more information in the middle of the conversion before resuming it // (like freezing variables for example). diff --git a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc index dc0ae41ba49a2b..892eed27385035 100644 --- a/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc +++ b/tensorflow/compiler/mlir/lite/tf_tfl_translate.cc @@ -17,56 +17,55 @@ limitations under the License. 
#include #include #include +#include #include #include +#include "absl/status/statusor.h" #include "absl/strings/str_split.h" +#include "absl/types/span.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/InitLLVM.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/AsmState.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project -#include "mlir/Transforms/Passes.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/init_mlir.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" -#include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export_flags.h" -#include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" +#include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_translate_cl.h" #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include 
"tensorflow/compiler/mlir/tensorflow/dialect_registration.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate_cl.h" #include "xla/translate/hlo_to_mhlo/translate.h" #include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/platform/errors.h" -#include "tensorflow/lite/model.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/lite/model_builder.h" #include "tensorflow/lite/schema/schema_generated.h" -#include "tsl/platform/statusor.h" using mlir::MLIRContext; using mlir::ModuleOp; using mlir::func::FuncOp; -using tsl::StatusOr; // Debugging flag to print function mapping in the flatbuffer. // NOLINTNEXTLINE @@ -170,7 +169,7 @@ int main(int argc, char **argv) { context.appendDialectRegistry(registry); } - StatusOr> module; + absl::StatusOr> module; std::unordered_set tags; tensorflow::GraphImportConfig specs; @@ -321,7 +320,7 @@ int main(int argc, char **argv) { if (bundle) session = bundle->GetSession(); auto status = tensorflow::ConvertTFExecutorToTFLOrFlatbuffer( module.value().get(), output_mlir, toco_flags, pass_config, tags, - /*saved_model_dir=*/"", session, &result, serialize_stablehlo_ops); + /*saved_model_dir=*/"", bundle.get(), &result, serialize_stablehlo_ops); if (!status.ok()) { llvm::errs() << status.message() << '\n'; return kTrFailure; diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc index 1b3c5d21ba3dfb..b6a08b35d69445 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h" +#include #include #include #include @@ -23,60 +24,75 @@ limitations under the License. 
#include #include +#include "absl/log/log.h" #include "absl/status/status.h" +#include "absl/status/statusor.h" #include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" #include "absl/types/span.h" +#include "flatbuffers/flatbuffer_builder.h" // from @flatbuffers #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/Extensions/AllExtensions.h" // from @llvm-project -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/FileUtilities.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "tensorflow/cc/saved_model/loader.h" +#include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/debug/debug.h" #include "tensorflow/compiler/mlir/lite/flatbuffer_export.h" +#include "tensorflow/compiler/mlir/lite/metrics/error_collector.h" #include "tensorflow/compiler/mlir/lite/metrics/error_collector_inst.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/serializer/flatbuffer_export.h" +#include "tensorflow/compiler/mlir/lite/quantization/stablehlo/quantization.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/op_stat_pass.h" -#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_tfl_pass.h" #include 
"tensorflow/compiler/mlir/lite/stablehlo/transforms/stablehlo_util.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/transforms.h" #include "tensorflow/compiler/mlir/lite/tf_tfl_passes.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" +#include "tensorflow/compiler/mlir/op_or_arg_name_mapper.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantize_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_freeze_variables.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/tf_saved_model_passes.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_import_options.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/core/framework/op.h" #include "tensorflow/core/framework/op_def.pb.h" +#include "tensorflow/core/framework/op_def_builder.h" #include "tensorflow/core/framework/types.pb.h" -#include "tensorflow/core/platform/status.h" +#include "tensorflow/core/protobuf/meta_graph.pb.h" +#include "tensorflow/core/public/session.h" +#include "tensorflow/lite/c/c_api_types.h" +#include "tensorflow/lite/experimental/remat/metadata_util.h" #include 
"tensorflow/lite/python/metrics/converter_error_data.pb.h" +#include "tensorflow/lite/schema/schema_generated.h" #include "tensorflow/lite/tools/optimize/quantize_weights.h" #include "tensorflow/lite/tools/optimize/reduced_precision_support.h" -#include "tsl/platform/statusor.h" +#include "tsl/platform/protobuf.h" // IWYU pragma: keep namespace tensorflow { namespace { + using mlir::MLIRContext; using mlir::ModuleOp; using mlir::Operation; using mlir::OwningOpRef; -using tsl::StatusOr; +using ::stablehlo::quantization::QuantizationConfig; +using ::tensorflow::quantization::PyFunctionLibrary; bool IsControlFlowV1Op(Operation* op) { return mlir::isa extra_tf_opdefs) { +absl::Status RegisterExtraTfOpDefs( + absl::Span extra_tf_opdefs) { for (const auto& tf_opdefs_string : extra_tf_opdefs) { - tensorflow::OpDef opdef; - if (!tensorflow::protobuf::TextFormat::ParseFromString(tf_opdefs_string, - &opdef)) { + OpDef opdef; + // NOLINTNEXTLINE: Use tsl::protobuf to be compatible with OSS. + if (!tsl::protobuf::TextFormat::ParseFromString(tf_opdefs_string, &opdef)) { LOG(ERROR) << "OpDef parsing failed for: " << tf_opdefs_string; - return errors::InvalidArgument("fail to parse extra OpDef"); + return absl::InvalidArgumentError("fail to parse extra OpDef"); } // Register extra opdefs. - // TODO(b/133770952): Support shape functions. - tensorflow::OpRegistry::Global()->Register( - [opdef](tensorflow::OpRegistrationData* op_reg_data) -> Status { - *op_reg_data = tensorflow::OpRegistrationData(opdef); - return OkStatus(); + // TODO: b/133770952 - Support shape functions. + OpRegistry::Global()->Register( + [opdef](OpRegistrationData* op_reg_data) -> absl::Status { + *op_reg_data = OpRegistrationData(opdef); + return absl::OkStatus(); }); } - return OkStatus(); + return absl::OkStatus(); +} + +// The hlo->tf conversion is done in three steps; pre-quantization, +// quantization, and post-quantization. 
Quantization is optional, enabled only +// when `pass_config.enable_stablehlo_quantizer` is `true`. If quantization is +// not run, it only performs the hlo->tf conversion. +// +// All parameters except for `pass_config`, `pass_manager`, `status_handler`, +// and `module` are only required for quantization. See the comments of +// `RunQuantization` for details. If quantization is not performed, they will be +// ignored. +// +// Returns a failure status when any of the three steps fail. `pass_manager` +// will be cleared before returning. +mlir::LogicalResult RunHloToTfConversion( + const mlir::TFL::PassConfig& pass_config, + const absl::string_view saved_model_dir, + const std::unordered_set& saved_model_tags, + const QuantizationConfig& quantization_config, + const PyFunctionLibrary* quantization_py_function_lib, + const SavedModelBundle* saved_model_bundle, mlir::PassManager& pass_manager, + mlir::StatusScopedDiagnosticHandler& status_handler, ModuleOp& module) { + // TODO: b/194747383 - We need to valid that indeed the "main" func is + // presented. 
+ AddPreQuantizationStableHloToTfPasses(/*entry_function_name=*/"main", + pass_config, pass_manager); + if (failed(pass_manager.run(module))) { + return mlir::failure(); + } + pass_manager.clear(); + + if (pass_config.enable_stablehlo_quantizer) { + const absl::StatusOr quantized_module_op = RunQuantization( + saved_model_bundle, saved_model_dir, saved_model_tags, + quantization_config, quantization_py_function_lib, module); + if (!quantized_module_op.ok()) { + LOG(ERROR) << "Failed to run quantization: " + << quantized_module_op.status(); + return mlir::failure(); + } + module = *quantized_module_op; + } + + AddPostQuantizationStableHloToTfPasses(pass_config, pass_manager); + if (failed(pass_manager.run(module))) { + return mlir::failure(); + } + pass_manager.clear(); + + return mlir::success(); } + } // namespace -StatusOr> LoadFromGraphdefOrMlirSource( +absl::StatusOr> LoadFromGraphdefOrMlirSource( const std::string& input_filename, bool input_mlir, bool use_splatted_constant, const std::vector& extra_tf_opdefs, const GraphImportConfig& specs, absl::string_view debug_info_file, @@ -156,8 +224,8 @@ StatusOr> LoadFromGraphdefOrMlirSource( std::string error_message; auto file = mlir::openInputFile(input_filename, &error_message); if (!file) { - llvm::errs() << error_message << "\n"; - return errors::InvalidArgument("fail to open input file"); + return absl::InvalidArgumentError( + absl::StrCat("Failed to open input file: ", error_message)); } if (input_mlir) { @@ -170,7 +238,7 @@ StatusOr> LoadFromGraphdefOrMlirSource( auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); if (!extra_opdefs_status.ok()) return extra_opdefs_status; - ::tensorflow::GraphdefToMlirOptions graphdef_conversion_options{ + GraphdefToMlirOptions graphdef_conversion_options{ std::string(debug_info_file), /*xla_compile_device_type=*/"", /*prune_unused_nodes=*/specs.prune_unused_nodes, @@ -182,21 +250,21 @@ StatusOr> LoadFromGraphdefOrMlirSource( /*enable_soft_placement=*/false}; 
if (use_splatted_constant) { - return tensorflow::GraphdefToSplattedMlirTranslateFunction( + return GraphdefToSplattedMlirTranslateFunction( file->getBuffer(), input_arrays, input_dtypes, input_shapes, output_arrays, control_output_arrays, graphdef_conversion_options, context); } - return tensorflow::GraphdefToMlirTranslateFunction( - file->getBuffer(), input_arrays, input_dtypes, input_shapes, - output_arrays, control_output_arrays, graphdef_conversion_options, - context); + return GraphdefToMlirTranslateFunction(file->getBuffer(), input_arrays, + input_dtypes, input_shapes, + output_arrays, control_output_arrays, + graphdef_conversion_options, context); } // Applying post-training dynamic range quantization from the old TOCO quantizer // on the translated_result using quant_specs and saving the final output in // result. -Status ApplyDynamicRangeQuantizationFromOldQuantizer( +absl::Status ApplyDynamicRangeQuantizationFromOldQuantizer( const mlir::quant::QuantizationSpecs& quant_specs, std::string translated_result, std::string* result) { flatbuffers::FlatBufferBuilder q_builder(/*initial_size=*/10240); @@ -206,14 +274,14 @@ Status ApplyDynamicRangeQuantizationFromOldQuantizer( ::tflite::optimize::BufferType quantized_type; switch (quant_specs.inference_type) { - case tensorflow::DT_QINT8: + case DT_QINT8: quantized_type = ::tflite::optimize::BufferType::QUANTIZED_INT8; break; - case tensorflow::DT_HALF: + case DT_HALF: quantized_type = ::tflite::optimize::BufferType::QUANTIZED_FLOAT16; break; default: - return errors::InvalidArgument("Quantized type not supported"); + return absl::InvalidArgumentError("Quantized type not supported"); break; } @@ -221,59 +289,59 @@ Status ApplyDynamicRangeQuantizationFromOldQuantizer( if (::tflite::optimize::QuantizeWeights( &q_builder, input_model, quantized_type, use_updated_hybrid_scheme, ::tflite::optimize::QuantizerType::OLD_QUANTIZER) != kTfLiteOk) { - return errors::InvalidArgument("Quantize weights transformation 
failed."); + return absl::InvalidArgumentError( + "Quantize weights transformation failed."); } const uint8_t* q_buffer = q_builder.GetBufferPointer(); *result = - string(reinterpret_cast(q_buffer), q_builder.GetSize()); + std::string(reinterpret_cast(q_buffer), q_builder.GetSize()); - return OkStatus(); + return absl::OkStatus(); } -Status ConvertTFExecutorToStablehloFlatbuffer( +absl::Status ConvertTFExecutorToStablehloFlatbuffer( mlir::PassManager& pass_manager, mlir::ModuleOp module, bool export_to_mlir, - mlir::StatusScopedDiagnosticHandler& statusHandler, + mlir::StatusScopedDiagnosticHandler& status_handler, const toco::TocoFlags& toco_flags, const mlir::TFL::PassConfig& pass_config, - std::optional session, std::string* result, + std::optional session, std::string* result, const std::unordered_set& saved_model_tags) { // Currently, TF quantization only support dynamic range quant, as such // when toco flag post training quantization is specified with converting to // stablehlo, we automatically enable dynamic range quantization if (toco_flags.post_training_quantize()) { - const auto status = tensorflow::quantization::PreprocessAndFreezeGraph( + const auto status = quantization::PreprocessAndFreezeGraph( module, module.getContext(), session); if (!status.ok()) { - return errors::Aborted("Failed to preprocess & freeze TF graph"); + return status_handler.Combine( + absl::InternalError("Failed to preprocess & freeze TF graph.")); } - // TODO(b/264218457): Refactor the component below once StableHLO Quantizer + // TODO: b/264218457 - Refactor the component below once StableHLO Quantizer // can run DRQ. Temporarily using TF Quantization for StableHLO DRQ. if (!toco_flags.has_quantization_options()) { // The default minimum number of elements a weights array must have to be // quantized by this transformation. 
const int kWeightsMinNumElementsDefault = 1024; - tensorflow::quantization::QuantizationOptions quantization_options; + quantization::QuantizationOptions quantization_options; quantization_options.mutable_quantization_method()->set_preset_method( - tensorflow::quantization::QuantizationMethod:: - METHOD_DYNAMIC_RANGE_INT8); - quantization_options.set_op_set( - tensorflow::quantization::UNIFORM_QUANTIZED); + quantization::QuantizationMethod::METHOD_DYNAMIC_RANGE_INT8); + quantization_options.set_op_set(quantization::UNIFORM_QUANTIZED); quantization_options.set_min_num_elements_for_weights( kWeightsMinNumElementsDefault); - tensorflow::quantization::AddQuantizePtqDynamicRangePasses( - pass_manager, quantization_options); + quantization::AddQuantizePtqDynamicRangePasses(pass_manager, + quantization_options); } if (failed(pass_manager.run(module))) { - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } } pass_manager.clear(); - mlir::odml::AddTFToStablehloPasses(pass_manager, /*skip_resize*/ true, - /*smuggle_disallowed_ops*/ true); + mlir::odml::AddTFToStablehloPasses(pass_manager, /*skip_resize=*/true, + /*smuggle_disallowed_ops=*/true); // Print out a detailed report of non-converted stats. 
pass_manager.addPass(mlir::odml::createPrintOpStatsPass( mlir::odml::GetAcceptedStableHLODialects())); @@ -283,13 +351,13 @@ Status ConvertTFExecutorToStablehloFlatbuffer( pass_manager, toco_flags.quantization_options()); } if (failed(pass_manager.run(module))) { - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } if (export_to_mlir) { llvm::raw_string_ostream os(*result); module.print(os); - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } // Write MLIR Stablehlo dialect into FlatBuffer @@ -301,24 +369,20 @@ Status ConvertTFExecutorToStablehloFlatbuffer( options.metadata[tflite::kModelUseStablehloTensorKey] = "true"; if (!tflite::MlirToFlatBufferTranslateFunction(module, options, result, true)) { - auto s = statusHandler.ConsumeStatus(); - std::string message = "Could not translate MLIR to FlatBuffer."; - if (!s.ok()) { - absl::StrAppend(&message, " ", s.ToString()); - } - return absl::UnknownError(message); + return status_handler.Combine( + absl::InternalError("Could not translate MLIR to FlatBuffer.")); } - return OkStatus(); + return absl::OkStatus(); } -Status ConvertTFExecutorToTFLOrFlatbuffer( +absl::Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const toco::TocoFlags& toco_flags, const mlir::TFL::PassConfig& pass_config, const std::unordered_set& saved_model_tags, - llvm::StringRef saved_model_dir, - std::optional session, std::string* result, - bool serialize_stablehlo_ops) { + llvm::StringRef saved_model_dir, SavedModelBundle* saved_model_bundle, + std::string* result, bool serialize_stablehlo_ops, + const PyFunctionLibrary* quantization_py_function_lib) { // Explicitly disable dumping Op details on failures. 
module.getContext()->printOpOnDiagnostic(false); @@ -326,92 +390,77 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::func::registerAllExtensions(registry); module.getContext()->appendDialectRegistry(registry); - // Register a warning handler only log to std out. - mlir::ScopedDiagnosticHandler s( - module.getContext(), [](mlir::Diagnostic& diag) { - if (diag.getSeverity() == mlir::DiagnosticSeverity::Warning) { - for (auto& note : diag.getNotes()) { - std::cout << note.str() << "\n"; - LOG(WARNING) << note.str() << "\n"; - } - } - return mlir::failure(); - }); - - mlir::StatusScopedDiagnosticHandler statusHandler(module.getContext(), - /*propagate=*/true); - - if (failed(IsValidGraph(module))) { - return statusHandler.ConsumeStatus(); - } + mlir::StatusScopedDiagnosticHandler status_handler(module.getContext(), + /*propagate=*/true); mlir::PassManager pass_manager(module.getContext()); mlir::registerPassManagerCLOptions(); if (mlir::failed(mlir::applyPassManagerCLOptions(pass_manager))) { - return absl::UnknownError("failed to apply MLIR pass manager CL options"); + return absl::InternalError("Failed to apply MLIR pass manager CL options."); } + InitPassManager(pass_manager, toco_flags.debug_options()); + pass_manager.addInstrumentation( std::make_unique( pass_manager.getContext())); - InitPassManager(pass_manager, toco_flags.debug_options()); + if (failed(IsValidGraph(module))) { + return status_handler.ConsumeStatus(); + } + + Session* session = saved_model_bundle == nullptr + ? nullptr + : saved_model_bundle->GetSession(); if (pass_config.enable_stablehlo_conversion) { + // `ConvertTFExecutorToStablehloFlatbuffer` expects a `std::nullopt` if the + // `Session*` is a nullptr. + std::optional session_opt = + session == nullptr ? 
std::nullopt : std::make_optional(session); + // return to avoid adding TFL converter path return ConvertTFExecutorToStablehloFlatbuffer( - pass_manager, module, export_to_mlir, statusHandler, toco_flags, - pass_config, session, result, saved_model_tags); + pass_manager, module, export_to_mlir, status_handler, toco_flags, + pass_config, std::move(session_opt), result, saved_model_tags); } - tensorflow::AddPreVariableFreezingTFToTFLConversionPasses(pass_config, - &pass_manager); + if (pass_config.enable_hlo_to_tf_conversion) { + if (failed(RunHloToTfConversion( + pass_config, saved_model_dir, saved_model_tags, + toco_flags.quantization_config(), quantization_py_function_lib, + saved_model_bundle, pass_manager, status_handler, module))) { + return status_handler.ConsumeStatus(); + } + } + + AddPreVariableFreezingTFToTFLConversionPasses(pass_config, &pass_manager); if (failed(pass_manager.run(module))) { - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } + // Freeze variables if a session is provided. - if (session.has_value()) { - mlir::TFL::ErrorCollectorInstrumentation collector(module.getContext()); - if (failed( - mlir::tf_saved_model::FreezeVariables(module, session.value()))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - if (!collector->CollectedErrors().empty()) { - // LINT.IfChange - return errors::InvalidArgument( - "Variable constant folding is failed. Please consider using " - "enabling `experimental_enable_resource_variables` flag in the " - "TFLite converter object. For example, " - "converter.experimental_enable_resource_variables = True"); - // LINT.ThenChange(//tensorflow/lite/python/lite_v2_test.py) - } - return status; - } + if (session != nullptr && + failed(mlir::tf_saved_model::FreezeVariables(module, session))) { + return status_handler.Combine(absl::InvalidArgumentError( + "Variable constant folding is failed. 
Please consider using " + "enabling `experimental_enable_resource_variables` flag in the " + "TFLite converter object. For example, " + "converter.experimental_enable_resource_variables = True")); } pass_manager.clear(); - tensorflow::AddPostVariableFreezingTFToTFLConversionPasses( - saved_model_dir, toco_flags, pass_config, &pass_manager); + AddPostVariableFreezingTFToTFLConversionPasses(saved_model_dir, toco_flags, + pass_config, &pass_manager); if (failed(pass_manager.run(module))) { - auto status = statusHandler.ConsumeStatus(); - mlir::TFL::ErrorCollector* collector = - mlir::TFL::ErrorCollector::GetErrorCollector(); - for (const auto& error_data : collector->CollectedErrors()) { - if (error_data.subcomponent() == "FreezeGlobalTensorsPass") { - // LINT.IfChange - return errors::InvalidArgument( - "Variable constant folding is failed. Please consider using " - "enabling `experimental_enable_resource_variables` flag in the " - "TFLite converter object. For example, " - "converter.experimental_enable_resource_variables = True"); - // LINT.ThenChange(//tensorflow/lite/python/lite_v2_test.py) - } - } - return status; + return status_handler.Combine(absl::InvalidArgumentError( + "Variable constant folding is failed. Please consider using " + "enabling `experimental_enable_resource_variables` flag in the " + "TFLite converter object. 
For example, " + "converter.experimental_enable_resource_variables = True")); } if (failed(GraphContainsStatefulPartitionedOp(module))) { - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } if (export_to_mlir) { @@ -420,12 +469,12 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( pass_manager.addPass(mlir::odml::createPrintOpStatsPass( mlir::odml::GetAcceptedTFLiteDialects())); if (failed(pass_manager.run(module))) { - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } llvm::raw_string_ostream os(*result); module.print(os); - return statusHandler.ConsumeStatus(); + return status_handler.ConsumeStatus(); } // Write MLIR TFLite dialect into FlatBuffer @@ -443,15 +492,11 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( } if (!tflite::MlirToFlatBufferTranslateFunction( module, options, &translated_result, serialize_stablehlo_ops)) { - auto s = statusHandler.ConsumeStatus(); - std::string message = "Could not translate MLIR to FlatBuffer."; - if (!s.ok()) { - absl::StrAppend(&message, " ", s.ToString()); - } - return absl::UnknownError(message); + return status_handler.Combine( + absl::InternalError("Could not translate MLIR to FlatBuffer.")); } - // TODO(b/176267167): Quantize flex fallback in the MLIR pipeline + // TODO: b/176267167 - Quantize flex fallback in the MLIR pipeline if (quant_specs.weight_quantization && (!quant_specs.RunAndRewriteDynamicRangeQuantizationPasses() || !pass_config.emit_builtin_tflite_ops)) { @@ -460,30 +505,33 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( // statement. 
auto status = ApplyDynamicRangeQuantizationFromOldQuantizer( quant_specs, translated_result, result); - if (!status.ok()) return status; + if (!status.ok()) { + return status_handler.Combine(status); + } } else { *result = translated_result; } if (mlir::failed(module.verifyInvariants())) { - return tensorflow::errors::Unknown("Final module is invalid"); + return status_handler.Combine( + absl::InternalError("Final module is invalid.")); } - return OkStatus(); + return absl::OkStatus(); } -StatusOr> ImportSavedModel( +absl::StatusOr> ImportSavedModel( const std::string& input_filename, const int saved_model_version, const std::unordered_set& tags, absl::Span extra_tf_opdefs, absl::Span exported_names, const GraphImportConfig& specs, bool enable_variable_lifting, mlir::MLIRContext* context, - std::unique_ptr* saved_model_bundle) { + std::unique_ptr* saved_model_bundle) { // Register extra TF ops passed as OpDef. auto extra_opdefs_status = RegisterExtraTfOpDefs(extra_tf_opdefs); if (!extra_opdefs_status.ok()) return extra_opdefs_status; if (saved_model_version == 2) { - auto module_or = tensorflow::SavedModelObjectGraphToMlirImport( + auto module_or = SavedModelObjectGraphToMlirImport( input_filename, tags, exported_names, context, /*unconditionally_use_set_output_shapes=*/true); if (!module_or.status().ok()) return module_or.status(); @@ -493,15 +541,14 @@ StatusOr> ImportSavedModel( options.upgrade_legacy = specs.upgrade_legacy; options.unconditionally_use_set_output_shapes = true; options.lift_variables = enable_variable_lifting; - auto module_or = tensorflow::SavedModelSignatureDefsToMlirImport( + auto module_or = SavedModelSignatureDefsToMlirImport( input_filename, tags, exported_names, context, options, saved_model_bundle); if (!module_or.status().ok()) return module_or.status(); return std::move(module_or).value(); } else { - return tensorflow::errors::InvalidArgument( - "Should be either saved model v1 or v2"); + return absl::InvalidArgumentError("Should be 
either saved model v1 or v2."); } } diff --git a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h index 8ef8813c6c19fc..a82afbeee7eda6 100644 --- a/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h +++ b/tensorflow/compiler/mlir/lite/tf_to_tfl_flatbuffer.h @@ -22,15 +22,17 @@ limitations under the License. #include #include +#include "absl/strings/string_view.h" #include "absl/types/span.h" #include "llvm/Support/SourceMgr.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/lite/common/tfl_pass_config.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.h" #include "tensorflow/compiler/mlir/tensorflow/translate/mlir_roundtrip_flags.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/lite/toco/toco_flags.pb.h" @@ -84,9 +86,10 @@ Status ConvertTFExecutorToTFLOrFlatbuffer( mlir::ModuleOp module, bool export_to_mlir, const toco::TocoFlags& toco_flags, const mlir::TFL::PassConfig& pass_config, const std::unordered_set& saved_model_tags, - llvm::StringRef saved_model_dir, - std::optional session, std::string* result, - bool serialize_stablehlo_ops = false); + llvm::StringRef saved_model_dir, SavedModelBundle* saved_model_bundle, + std::string* result, bool serialize_stablehlo_ops = false, + const quantization::PyFunctionLibrary* quantization_py_function_lib = + nullptr); } // namespace tensorflow #endif // TENSORFLOW_COMPILER_MLIR_LITE_TF_TO_TFL_FLATBUFFER_H_ diff --git a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc 
b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc index bf4224c7631dd0..f620995ea2ecfd 100644 --- a/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc +++ b/tensorflow/compiler/mlir/lite/transforms/default_quant_params.cc @@ -14,25 +14,24 @@ limitations under the License. ==============================================================================*/ #include +#include #include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/IR/AffineExpr.h" #include "mlir/IR/AffineMap.h" -#include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinTypes.h" -#include "mlir/IR/PatternMatch.h" #include "mlir/Pass/Pass.h" -#include "mlir/Support/LLVM.h" -#include "absl/memory/memory.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/StringSwitch.h" -#include "mlir/IR/Location.h" // from @llvm-project +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/ir/tfl_ops.h" #include "tensorflow/compiler/mlir/lite/quantization/ir/FakeQuantSupport.h" -#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" -#include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h" +#include "tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/utils/utils.h" //===----------------------------------------------------------------------===// // The Pass to add default quantization parameters for the activations which @@ -215,7 +214,8 @@ quant::QuantParams DefaultQuantParamsPass::GetQuantParamsForBias( // The non-bias hasn't been quantized, let's skip this bias. 
if (non_bias_types.size() != non_biases.size()) return {}; - return func(non_bias_types, false); + return func(/*op_types=*/non_bias_types, /*adjusted_quant_dim=*/-1, + /*legacy_float_scale=*/false); } quant::QuantParams DefaultQuantParamsPass::GetDefaultQuantParams( diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td index a2ea10fe199736..cfe9bc754d8077 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_patterns.td @@ -70,6 +70,9 @@ def ExtractSingleElementAsInt32 : NativeCodeCall< def CreateTFCastToInt32Op : NativeCodeCall< "CreateCastToInt32($0, $_loc, $_builder)">; +def CreateInt32ConstOrCast : NativeCodeCall< + "CreateInt32ConstOrCast($0, $_loc, $_builder)">; + def CreateNoneValue : NativeCodeCall< "$_builder.create($0.getLoc(), $_builder.getUnitAttr())">; @@ -587,10 +590,7 @@ def LegalizeCumsum : Pat< def LegalizeReshape : Pat< (TF_ReshapeOp $input, $shape), - (TFL_ReshapeOp $input, (CreateTFCastToInt32Op $shape))>; - -def ZeroIntAttr - : AttrConstraint().getInt() == 0">>; + (TFL_ReshapeOp $input, (CreateInt32ConstOrCast $shape))>; def LegalizeStridedSlice : Pat< (TF_StridedSliceOp diff --git a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc index 1ae84c64ddea7f..cc4e7e46b71b99 100644 --- a/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/legalize_tf.cc @@ -33,13 +33,17 @@ limitations under the License. 
#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project @@ -106,6 +110,34 @@ Value CreateCastToInt32(Value val, Location loc, PatternRewriter& rewriter) { rewriter.getBoolAttr(false)); } +// Utility function to- +// 1. Create a tfl.const op with an int32_t values, from an MLIR Value, if the +// `Value` can be matched to a Constant DenseIntElementsAttr. +// This will make sure the dynamic dimensions are asigned to be `-1` +// 2. In the default case, cast the `Value` to an int32_t. +Value CreateInt32ConstOrCast(Value val, Location loc, + PatternRewriter& rewriter) { + if (val.getType().cast().hasStaticShape()) { + DenseElementsAttr shape_value_attr; + if (matchPattern(val, m_Constant(&shape_value_attr))) { + SmallVector new_shape_array_i32; + auto shape_value_array = shape_value_attr.getValues(); + for (int32_t idx = 0; idx < shape_value_array.size(); ++idx) { + auto size = shape_value_array[idx].getSExtValue(); + new_shape_array_i32.push_back( + ShapedType::isDynamic(size) ? 
-1 : static_cast(size)); + } + return rewriter.create( + loc, DenseIntElementsAttr::get( + RankedTensorType::get(new_shape_array_i32.size(), + rewriter.getIntegerType(32)), + new_shape_array_i32)); + } + } + + return CreateCastToInt32(val, loc, rewriter); +} + // Get shape of an operand or result, support both dynamic and static shape. Value GetShape(Value input, Location loc, PatternRewriter& rewriter) { auto shaped_type = input.getType().cast(); diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize.cc b/tensorflow/compiler/mlir/lite/transforms/optimize.cc index ec416dc3ea7939..13703233f6259f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/optimize.cc @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -60,6 +60,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/lite/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/utils/attribute_utils.h" +#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" #include "tensorflow/compiler/mlir/lite/utils/convert_type.h" #include "tensorflow/compiler/mlir/lite/utils/utils.h" #include "tensorflow/compiler/mlir/lite/utils/validators.h" @@ -704,6 +705,252 @@ bool IsPermutationNCHW(Value perm) { #include "tensorflow/compiler/mlir/lite/transforms/generated_optimize.inc" +// Returns 1D 32-bit dense elements attribute with the given values. 
+static DenseIntElementsAttr GetI32ElementsAttr(ArrayRef values, + Builder *builder) { + RankedTensorType ty = mlir::RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(32)); + return DenseIntElementsAttr::get(ty, values); +} + +DenseIntElementsAttr GetI64ElementsAttr(ArrayRef values, + Builder *builder) { + RankedTensorType ty = RankedTensorType::get( + {static_cast(values.size())}, builder->getIntegerType(64)); + return DenseIntElementsAttr::get(ty, values); +} + +// Get the number of leading 1s in the shape of the given input. +// Ex. input_shape = [1 x 1 x 1 x 1 x 2 x 1] => 4 +// returns 0 if the input shape is not static. +int GetNumLeadingOnes(ShapedType input_type) { + if (!input_type.hasStaticShape()) return 0; + auto input_shape = input_type.getShape(); + int num_leading_broadcast_dims = 0; + for (int i = 0; i < input_shape.size(); ++i) { + if (input_shape[i] == 1) { + ++num_leading_broadcast_dims; + } else { + break; + } + } + return num_leading_broadcast_dims; +} + +// Return the number of trailing 1s in the shape of the given input. +// Ex. input_shape = [1 x 1 x 2 x 1] => 1 +// returns 0 if the input shape is not static. 
+int GetNumTrailingOnes(ShapedType input_type) { + if (!input_type.hasStaticShape()) return 0; + auto input_shape = input_type.getShape(); + int num_trailing_broadcast_dims = 0; + for (int i = input_shape.size() - 1; i >= 0; --i) { + if (input_shape[i] == 1) { + ++num_trailing_broadcast_dims; + } else { + break; + } + } + return num_trailing_broadcast_dims; +} + +// Consider as Reshape( +// Broadcast( +// Reshape(input, // input_shape=[1 x n] +// inner_shape), // inner_shape=[1 x 1 x 1 x n x 1 x 1] +// broadcast_shape), // broadcast_shape=[1 x 1 x 1 x n x m x 1] +// outer_shape))) // outer_shape=[1 x 1 x n*m] +// Here the broadcast operation is used to create `m` repetetions of the `n` +// elements in the origiginal tensor, making a total of `m*n` number of elements +// in the final tensor that will then be reshaped to form something like +// [1 x 1 x 1 x m*n] by the outermost reshape_op. +// problem: The inefficiency here is that the innermost reshape_op and the +// broadcast_op are introducing unnecessary leading and trailing 1s'. +// fix: Remove the unnecessary 1s' in the inner reshape_op and broadcast_op. +struct SqueezeReshapesAroundBroadcastOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::BroadcastToOp tfl_broadcast_to_op, + PatternRewriter &rewriter) const override { + auto loc = tfl_broadcast_to_op->getLoc(); + + // Match the + // Reshape( + // Broadcast( + // Reshape(input,inner_shape), + // broadcast_shape), + // outer_shape))) pattern. + if (!llvm::dyn_cast_or_null( + tfl_broadcast_to_op.getInput().getDefiningOp()) || + // Check that the broadcast_to op has only one use. 
+ !tfl_broadcast_to_op.getOutput().hasOneUse() || + !llvm::dyn_cast_or_null( + *tfl_broadcast_to_op.getOutput().getUsers().begin())) { + return rewriter.notifyMatchFailure( + loc, "No Reshape->BroadcastTo->Reshape pattern found"); + } + + // Pattern is applied only if the broadcast_to shape has more than 5 + // dimensions. + if (tfl_broadcast_to_op.getShape() + .getType() + .cast() + .getNumElements() < 6) { + return rewriter.notifyMatchFailure(loc, + "Not supported broadcast_to shape"); + } + auto inner_reshape_op = llvm::dyn_cast_or_null( + tfl_broadcast_to_op.getInput().getDefiningOp()); + auto inner_reshape_input = inner_reshape_op.getInput(); + auto outer_reshape_op = llvm::dyn_cast_or_null( + *tfl_broadcast_to_op.getOutput().getUsers().begin()); + + // Check that the outermost reshape_op in the pattern does not add + // additional elements to the final output tensor. + // TODO: b/323217483. This code needs to generalized to additional cases. + // For example- inner-shape = [1, 1, 1, 8, 1, 10], + // broadcast_shape = [1, 1, 1, 8, 16, 10] & outer_shape = [1, 1, 1, 1280, 1] + // And extend the pettern to handle dynamic shapes. + if (!inner_reshape_op.getOutput().getType().hasStaticShape() || + !tfl_broadcast_to_op.getOutput().getType().hasStaticShape() || + !outer_reshape_op.getOutput().getType().hasStaticShape()) { + return rewriter.notifyMatchFailure( + loc, "Unsupported shapes. Currely only static shapes are supported"); + } + + if (!IsLastDimEqualToNumElements(inner_reshape_input.getType(), + inner_reshape_op.getOutput().getType()) || + !IsLastDimEqualToNumElements( + outer_reshape_op.getOutput().getType(), + tfl_broadcast_to_op.getOutput().getType())) { + return rewriter.notifyMatchFailure( + loc, "Not supported Reshape->BroadcastTo->Reshape pattern"); + } + + // Calculate the number of extra leading and trailing 1s in the + // broadcast_op output. 
+ auto broadcast_output_shapetype = + tfl_broadcast_to_op.getOutput().getType().cast(); + int num_leading_broadcast_dims = + GetNumLeadingOnes(broadcast_output_shapetype); + int num_trailing_broadcast_dims = + GetNumTrailingOnes(broadcast_output_shapetype); + + // Get the new shape for the inner reshape_op after removing the extra 1s. + llvm::SmallVector new_reshape_shape_i32{ + inner_reshape_op.getOutput() + .getType() + .cast() + .getShape() + .drop_back(num_trailing_broadcast_dims) + .drop_front(num_leading_broadcast_dims)}; + + Value new_reshape_shape_value = rewriter.create( + inner_reshape_op->getLoc(), + GetI32ElementsAttr(new_reshape_shape_i32, &rewriter)); + + auto new_inner_reshape_op = rewriter.create( + inner_reshape_op->getLoc(), + inner_reshape_input, new_reshape_shape_value); + + // Create a new reshape_op to replace the old inner reshape_op. + rewriter.replaceOp(inner_reshape_op, new_inner_reshape_op.getResult()); + + // Get the new shape for the broadcast_op after removing the extra 1s. + llvm::SmallVector new_broadcast_shape{ + broadcast_output_shapetype.getShape() + .drop_back(num_trailing_broadcast_dims) + .drop_front(num_leading_broadcast_dims)}; + + Value new_broadcast_shape_value = rewriter.create( + loc, GetI64ElementsAttr(new_broadcast_shape, &rewriter)); + + auto new_broadcast_to_op = rewriter.create( + loc, RankedTensorType::get(new_broadcast_shape, rewriter.getF32Type()), + new_inner_reshape_op.getOutput(), new_broadcast_shape_value); + + // Create a new broadcast_op to replace the old broadcast_op. + rewriter.replaceOp(tfl_broadcast_to_op, new_broadcast_to_op.getResult()); + + return success(); + } +}; + +// This pattern matches TFL::BroadcastToOp WITH TENSOR RANK <= 4 and replaces +// it with a MulOp that multiplies the tensor by a splat constant with 1s. 
+struct ConvertTFLBroadcastToMulOp + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(TFL::BroadcastToOp tfl_broadcast_to_op, + PatternRewriter &rewriter) const override { + auto input_type = + tfl_broadcast_to_op.getInput().getType().cast(); + auto output_type = + tfl_broadcast_to_op.getOutput().getType().cast(); + auto shape_type = + tfl_broadcast_to_op.getShape().getType().cast(); + Type element_type = input_type.getElementType(); + + auto loc = tfl_broadcast_to_op->getLoc(); + + // Check that the output type is not dynamic and is less-than-equal to 4D or + // the shape type is static, 1D and has less-than-equal to 4 elements. + bool is_output_shape_dynamic = + (!output_type.hasRank() || (output_type.getRank() > 4) || + (output_type.getNumDynamicDims() > 0)); + bool is_broadcast_shape_dynamic = + (!shape_type.hasStaticShape() || (shape_type.getRank() != 1) || + (shape_type.getDimSize(0) > 4)); + if (is_output_shape_dynamic && is_broadcast_shape_dynamic) + return rewriter.notifyMatchFailure( + loc, "output_rank or broadcast_to shape not supported"); + + // Allow lowering when the input's elements type is F32, BFloat16, I32 or + // I16. + if (!(element_type.isa() || + element_type.isInteger(32) || element_type.isInteger(16))) + return rewriter.notifyMatchFailure(loc, "element_type_not_supported"); + + // TFL_FillOp is created only if is_output_shape_dynamic is true, otherwise + // a Arith.ConstOp is created. 
+ if (is_output_shape_dynamic && + output_type.getElementType().isUnsignedInteger()) { + return rewriter.notifyMatchFailure( + loc, + "Unsigned broadcast_to output with dynamic shape is not supported"); + } + + Value mul_rhs_value; + if (!output_type.hasRank() || (output_type.getNumDynamicDims() > 0)) { + auto status_or_const_op = + CreateConstOpWithSingleValue(&rewriter, loc, input_type, 1); + if (!status_or_const_op.ok()) { + return failure(); + } + + mul_rhs_value = rewriter.create( + loc, output_type, tfl_broadcast_to_op.getShape(), + status_or_const_op.value()); + } else { + auto status_or_const_op = + CreateConstOpWithVectorValue(&rewriter, loc, output_type, 1); + if (!status_or_const_op.ok()) { + return failure(); + } + + mul_rhs_value = status_or_const_op.value(); + } + + auto mul_op = rewriter.create( + loc, output_type, tfl_broadcast_to_op.getInput(), mul_rhs_value, + rewriter.getStringAttr("NONE")); + rewriter.replaceOp(tfl_broadcast_to_op, mul_op.getResult()); + return success(); + } +}; + struct FuseAddAndStridedSlice : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; @@ -886,14 +1133,9 @@ struct Convert2DUpscalingToResizeNearestNeighor SmallVector reshape_shape_in_int64( {1, image_size, image_size, feature_size}); - auto reshape_shape_type = - RankedTensorType::get({static_cast(reshape_shape.size())}, - rewriter.getIntegerType(32)); - auto reshape_shape_attr = - DenseIntElementsAttr::get(reshape_shape_type, reshape_shape); - auto reshape_shape_const_op = rewriter.create( - gather_nd_first->getLoc(), reshape_shape_attr); + gather_nd_first->getLoc(), + GetI32ElementsAttr(reshape_shape, &rewriter)); auto reshape_op = rewriter.create( gather_nd_first->getLoc(), @@ -903,12 +1145,8 @@ struct Convert2DUpscalingToResizeNearestNeighor // Add TFL::resize_nearest_neighor op for 2x upscaling. 
SmallVector size_vec = {image_size * 2, image_size * 2}; - auto size_type = mlir::RankedTensorType::get( - {static_cast(size_vec.size())}, rewriter.getIntegerType(32)); - auto size_attr = mlir::DenseIntElementsAttr::get(size_type, size_vec); - - auto size_const_op = - rewriter.create(gather_nd_first->getLoc(), size_attr); + auto size_const_op = rewriter.create( + gather_nd_first->getLoc(), GetI32ElementsAttr(size_vec, &rewriter)); auto resize = rewriter.create( gather_nd_first->getLoc(), transpose_second.getResult().getType(), @@ -1249,6 +1487,12 @@ struct FuseFullyConnectedAndMul : public OpRewritePattern { auto fc_op = dyn_cast_or_null( mul_op.getLhs().getDefiningOp()); if (!fc_op) return failure(); + + // Check if FullyConnected has only one use, that is the LHS of Mul Op. + // Otherwise this will duplicate the fullyconnected op to serve the + // remaining uses. + if (!fc_op->hasOneUse()) return failure(); + Value filter = fc_op.getFilter(); Value bias = fc_op.getBias(); ElementsAttr cst_tmp; @@ -1759,11 +2003,9 @@ struct ConvertTrivialTransposeOpToReshapeOp output_shape_values.push_back( ShapedType::isDynamic(dim) ? -1 : static_cast(dim)); } - auto type = mlir::RankedTensorType::get(output_shape_values.size(), - rewriter.getIntegerType(32)); - auto new_shape_attr = - mlir::DenseIntElementsAttr::get(type, output_shape_values); - auto new_shape = rewriter.create(loc, new_shape_attr); + + auto new_shape = rewriter.create( + loc, GetI32ElementsAttr(output_shape_values, &rewriter)); rewriter.replaceOpWithNewOp( transpose_op, transpose_op.getOutput().getType(), @@ -1932,11 +2174,7 @@ struct FuseUnpackAndConcatToReshape ShapedType::isDynamic(size) ? 
-1 : static_cast(size)); } auto new_shape = rewriter.create( - concat_op.getLoc(), - DenseIntElementsAttr::get( - RankedTensorType::get(new_shape_array_i32.size(), - rewriter.getIntegerType(32)), - new_shape_array_i32)); + concat_op.getLoc(), GetI32ElementsAttr(new_shape_array_i32, &rewriter)); rewriter.replaceOpWithNewOp( concat_op, output_type, unpack_op.getInput(), new_shape); @@ -2126,9 +2364,7 @@ struct FuseReshapeAndTransposeAroundBatchMatmul transpose_input.getType().getShape().begin() + 2, transpose_input.getType().getShape().end(), 1, std::multiplies()))}; auto shape_constant = rewriter.create( - batch_matmul.getLoc(), - DenseIntElementsAttr::get( - RankedTensorType::get(3, rewriter.getI32Type()), new_shape)); + batch_matmul.getLoc(), GetI32ElementsAttr(new_shape, &rewriter)); auto reshaped_input = rewriter.create( batch_matmul.getLoc(), transpose_op.getInput(), shape_constant); rewriter.replaceOpWithNewOp( @@ -2190,10 +2426,7 @@ struct FuseTransposeReshapeIntoBatchMatmul reshape_op.getType().getShape().drop_front().end()); new_shape.push_back(reshape_op.getType().getDimSize(0)); auto shape_constant = rewriter.create( - op.getLoc(), DenseIntElementsAttr::get( - RankedTensorType::get(reshape_op.getType().getRank(), - rewriter.getI32Type()), - new_shape)); + op.getLoc(), GetI32ElementsAttr(new_shape, &rewriter)); auto new_reshape = rewriter.create( op.getLoc(), transpose_op.getInput(), shape_constant); rewriter.replaceOpWithNewOp( @@ -2421,8 +2654,8 @@ void OptimizePass::runOnOperation() { // binary ops. 
RewritePatternSet phase_0_patterns(&getContext()); phase_0_patterns - .add( - ctx); + .add(ctx); (void)applyPatternsAndFoldGreedily(func, std::move(phase_0_patterns)); // Potentially the binary ops might be fused together, like hard_swish, thus diff --git a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td index 114a01492ff16b..008decb62b0d55 100644 --- a/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td +++ b/tensorflow/compiler/mlir/lite/transforms/optimize_patterns.td @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -1129,6 +1129,26 @@ def OptimizeSliceOp : Pat< (replaceWithValue $input), [(CanOptimizeIdentitySliceOp $input, $begin, $size)]>; +// Convert the StridedSliceOp to a SliceOp when possible. This will enable other +// optimizations on SliceOp to run. 
+def OptimizeStridedSlice : Pat< + (TFL_StridedSliceOp $input, + (Arith_ConstantOp $begin), + (Arith_ConstantOp $end), + (Arith_ConstantOp $stride), + ZeroIntAttr:$_, + ZeroIntAttr:$_, + ZeroIntAttr:$_, + ZeroIntAttr:$_, + ZeroIntAttr:$_, + ConstBoolAttrFalse), + (TFL_SliceOp $input, + (Arith_ConstantOp $begin), + (Arith_ConstantOp (GetOffSet $begin, $end))), + [(IsAllOnesConstant $stride), + (HasNonNegativeValues $begin), + (HasNonNegativeOffset $begin, $end)]>; + def GetNumElementsOrOne: NativeCodeCall<"GetNumElementsOrOne($0.getType())">; def ReshapeValueDroppingLastDim : NativeCodeCall< @@ -1510,14 +1530,7 @@ def FuseReshapesAroundBatchMatMulLHS1: Pat< (BroadcastDimsProductEqual<1> $final_shape_change, $bmm_tmp_output), (AreTensorSubSectionShapesEqual<1, 1> $input, $final_shape_change)]>; -// Fuse redundant TFL_TransposeOp into TFL_BatchMatMulOp -def FuseTransposeIntoBatchMatMulRHS: Pat< - (TFL_BatchMatMulOp $lhs, - (TFL_TransposeOp:$transposed_value $input, (Arith_ConstantOp:$perm_value $p0)), - $adj_x, $adj_y, $asymmetric_quantize_inputs), - (TFL_BatchMatMulOp $lhs, $input, $adj_x, ConstBoolAttrTrue, $asymmetric_quantize_inputs), - [(AreLastTwoDimsTransposed $perm_value), - (IsBoolAttrEqual<"false"> $adj_y)]>; + // Fuse redundant TFL_TransposeOp into TFL_BatchMatMulOp def FuseTransposeIntoBatchMatMulLHS: Pat< diff --git a/tensorflow/compiler/mlir/lite/transforms/passes.td b/tensorflow/compiler/mlir/lite/transforms/passes.td index 4dfea319b336d7..45428d2648a43f 100644 --- a/tensorflow/compiler/mlir/lite/transforms/passes.td +++ b/tensorflow/compiler/mlir/lite/transforms/passes.td @@ -309,6 +309,8 @@ def PrepareQuantizePass : Pass<"tfl-prepare-quantize", "mlir::func::FuncOp"> { "disable-set-input-nodes-quantization-params", "bool", "false", "Whether disable set input nodes quantization parameters.">, + Option<"is_qdq_conversion_", "is-qdq-conversion", "bool", "false", + "Whether the source graph is a QDQ model intended for conversion only.">, ]; } @@ -323,6 
+325,9 @@ def PrepareDynamicRangeQuantizePass : Pass<"tfl-prepare-quantize-dynamic-range", Option<"enable_dynamic_range_per_channel_quantization_", "enable-dynamic-range-per-channel-quantization", "bool", "true", "Whether enable per-channel quantized weights.">, + Option<"enable_dynamic_range_per_channel_quantization_for_dense_layers_", + "enable-dynamic-range-per-channel-quantization-for-dense-layers", "bool", + "true", "Whether enable per-channel quantized weights for Fully Connected layers (default is per tensor).">, Option<"min_elements_for_weights_", "min-elements-for-weights", "int64_t", "1024", "The minimum number of elements in a weights array required to apply quantization.">, diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc index 807d1c2dfaa2b5..fb613d74bbfaa2 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize.cc @@ -14,7 +14,9 @@ limitations under the License. ==============================================================================*/ // This transformation pass applies quantization propagation on TFLite dialect. +#include #include +#include #include #include #include @@ -217,7 +219,9 @@ bool PrepareQuantizePass::SetInputNodesQuantizationParams(func::FuncOp func) { #include "tensorflow/compiler/mlir/lite/utils/generated_op_quant_spec_getters.inc" bool PrepareQuantizePass::RemoveRedundantStats(func::FuncOp func) { - return RemoveRedundantStatsOps(func, GetOpQuantSpec); + return RemoveRedundantStatsOps( + func, std::bind(GetOpQuantSpec, std::placeholders::_1, + quant_specs_.disable_per_channel_for_dense_layers)); } static Value Quantized(Operation* user) { @@ -402,12 +406,21 @@ void PrepareQuantizePass::runOnOperation() { SanityCheckAndAdjustment(func); + // Bind the getter with the fixed configuration parameter for the correct + // quantization settings of the ops. 
+ std::function(Operation*)> + op_quant_spec_getter = + std::bind(GetOpQuantSpec, std::placeholders::_1, + quant_specs_.disable_per_channel_for_dense_layers); + // Finally, the quantization parameters can be propagated to the rest of the // values (tensors). ApplyQuantizationParamsPropagation( func, is_signed, bit_width, - disable_per_channel_ || quant_specs_.disable_per_channel, GetOpQuantSpec, - infer_tensor_range, quant_specs_.legacy_float_scale); + disable_per_channel_ || quant_specs_.disable_per_channel, + op_quant_spec_getter, infer_tensor_range, quant_specs_.legacy_float_scale, + (is_qdq_conversion_ || + quant_specs_.qdq_conversion_mode != quant::QDQConversionMode::kQDQNone)); } } // namespace diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc index 951748b31273f3..a60ebe57212f9e 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_dynamic_range.cc @@ -72,6 +72,8 @@ class PrepareDynamicRangeQuantizePass : quant_specs_(quant_specs) { enable_dynamic_range_per_channel_quantization_ = !quant_specs_.disable_per_channel; + enable_dynamic_range_per_channel_quantization_for_dense_layers_ = + !quant_specs_.disable_per_channel_for_dense_layers; min_elements_for_weights_ = quant_specs_.minimum_elements_for_weights; } @@ -275,6 +277,10 @@ class PrepareDynamicRangeQuantizableOp op_with_per_axis_support = op_with_narrow_range && affine_user.GetQuantizationDimIndex() != -1 && !quant_specs_.disable_per_channel; + if (dyn_cast(quantize_op)) { + op_with_per_axis_support &= + !quant_specs_.disable_per_channel_for_dense_layers; + } } QuantizedType quant_type = nullptr; @@ -473,6 +479,8 @@ void PrepareDynamicRangeQuantizePass::runOnOperation() { quant_specs_.disable_per_channel = !enable_dynamic_range_per_channel_quantization_; + 
quant_specs_.disable_per_channel_for_dense_layers = + !enable_dynamic_range_per_channel_quantization_for_dense_layers_; quant_specs_.minimum_elements_for_weights = min_elements_for_weights_; if (!enable_custom_op_quantization_.empty()) { diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h index 90a48d577ef669..216c6756ab67db 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_quantize_helper.h @@ -505,9 +505,10 @@ class ConvertLstmStatsToQDQs : public ConvertOpStatsToQDQs { inline quant::AccumulatorScaleFunc GetUniformQuantizedTypeForBiasWithScale( double scale) { return [=](const std::vector& quant_params, - bool legacy_float_scale) -> quant::QuantParams { - if (auto qtype = quant::GetUniformQuantizedTypeForBias(quant_params, - legacy_float_scale) + const int adjusted_quant_dim, + const bool legacy_float_scale) -> quant::QuantParams { + if (auto qtype = quant::GetUniformQuantizedTypeForBias( + quant_params, legacy_float_scale, adjusted_quant_dim) .dyn_cast_or_null()) { return quant::UniformQuantizedType::get( qtype.getFlags(), qtype.getStorageType(), qtype.getExpressedType(), diff --git a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc index c80be89c567e09..2e920595819f84 100644 --- a/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc +++ b/tensorflow/compiler/mlir/lite/transforms/prepare_tf.cc @@ -39,7 +39,6 @@ limitations under the License. 
#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringSwitch.h" -#include "llvm/Support/Casting.h" #include "llvm/Support/Debug.h" #include "mlir/Dialect/Affine/Analysis/LoopAnalysis.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project @@ -780,48 +779,6 @@ struct ConvertTFStridedSlice : public RewritePattern { } }; -struct ConvertTFBroadcastTo : public RewritePattern { - explicit ConvertTFBroadcastTo(MLIRContext *context) - : RewritePattern(TF::BroadcastToOp::getOperationName(), 1, context) {} - - LogicalResult matchAndRewrite(Operation *op, - PatternRewriter &rewriter) const override { - auto tf_broadcast_to_op = cast(op); - auto input_type = - tf_broadcast_to_op.getInput().getType().cast(); - auto output_type = - tf_broadcast_to_op.getOutput().getType().cast(); - auto shape_type = - tf_broadcast_to_op.getShape().getType().cast(); - Type element_type = input_type.getElementType(); - - // Allow lowering when low dimension inputs are given and its type is F32 or - // I32. 
- if (!((output_type.hasRank() && output_type.getRank() <= 4) || - (shape_type.hasStaticShape() && shape_type.getRank() == 1 && - shape_type.getDimSize(0) <= 4))) - return failure(); - - if (!(element_type.isa() || - element_type.isInteger(32) || element_type.isInteger(16))) - return failure(); - - auto status_or_const_op = - CreateConstOpWithSingleValue(&rewriter, op->getLoc(), input_type, 1); - if (!status_or_const_op.ok()) { - return failure(); - } - - auto tf_fill_op = rewriter.create(op->getLoc(), output_type, - tf_broadcast_to_op.getShape(), - status_or_const_op.value()); - - auto mul_op = rewriter.create( - op->getLoc(), output_type, tf_broadcast_to_op.getInput(), tf_fill_op); - rewriter.replaceOp(op, mul_op.getResult()); - return success(); - } -}; // The below pattern is equivalent to the DRR rule below // The checks are dependent on generated values, so we can't add @@ -1591,9 +1548,8 @@ void PrepareTFPass::runOnOperation() { if (unfold_batch_matmul_) { TF::PopulateUnrollTfBatchMatMul(ctx, phase_2_patterns); } - phase_2_patterns - .add(ctx); + phase_2_patterns.add(ctx); phase_2_patterns.add( ctx, allow_bf16_and_f16_type_legalization_); // Remove redundant reshape ops. 
diff --git a/tensorflow/compiler/mlir/lite/transforms/quantize.cc b/tensorflow/compiler/mlir/lite/transforms/quantize.cc index 0bc21e41f68ced..0d3494d851aaba 100644 --- a/tensorflow/compiler/mlir/lite/transforms/quantize.cc +++ b/tensorflow/compiler/mlir/lite/transforms/quantize.cc @@ -257,7 +257,9 @@ void QuantizePass::runOnOperation() { populateWithGenerated(patterns); - if (quant_specs.weight_quantization || quant_specs.use_fake_quant_num_bits) { + if (quant_specs.weight_quantization || quant_specs.use_fake_quant_num_bits || + quant_specs.qdq_conversion_mode == + quant::QDQConversionMode::kQDQDynamic) { patterns.add(ctx, quant_params); } else { patterns.add(ctx, diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc index 86d0509ceb7e65..0e5d10e9e7469a 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.cc +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.cc @@ -15,39 +15,47 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/utils/constant_utils.h" +#include +#include #include #include +#include "absl/status/status.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_attributes.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/utils/mangling_util.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/tensor_shape.pb.h" #include "tensorflow/core/platform/status.h" +#include "tsl/platform/statusor.h" namespace mlir { namespace TFL { -tsl::StatusOr CreateConstOpWithSingleValue( - PatternRewriter* rewriter, Location loc, ShapedType shaped_type, - int value) { +tsl::StatusOr CreateTypedAttr(ShapedType shaped_type, int value) { Type element_type = shaped_type.getElementType(); - ShapedType scalar_type = RankedTensorType::get({}, element_type); - TypedAttr attr; if (element_type.isF16()) { auto floatType = mlir::FloatType::getF16(element_type.getContext()); auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); std::vector floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); + return DenseElementsAttr::get(shaped_type, floatValues); } else if (element_type.isBF16()) { auto floatType = mlir::FloatType::getBF16(element_type.getContext()); auto floatAttr = mlir::FloatAttr::get(floatType, static_cast(value)); std::vector 
floatValues({floatAttr}); - attr = DenseElementsAttr::get(scalar_type, floatValues); + return DenseElementsAttr::get(shaped_type, floatValues); } else if (element_type.isF32()) { - attr = - DenseElementsAttr::get(scalar_type, static_cast(value)); + return DenseElementsAttr::get(shaped_type, + static_cast(value)); } else if (auto complex_type = element_type.dyn_cast()) { auto etype = complex_type.getElementType(); if (etype.isF32()) { @@ -64,7 +72,7 @@ tsl::StatusOr CreateConstOpWithSingleValue( repr.set_tensor_content(content); std::string mangled = tensorflow::mangling_util::MangleTensor(repr); - attr = mlir::TF::TensorProtoAttr::get(scalar_type, mangled); + return mlir::TF::TensorProtoAttr::get(shaped_type, mangled); } else { return tensorflow::Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); @@ -73,19 +81,19 @@ tsl::StatusOr CreateConstOpWithSingleValue( if (element_type.isSignedInteger()) { switch (itype.getWidth()) { case 8: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 16: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 32: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 64: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; default: @@ -95,19 +103,19 @@ tsl::StatusOr CreateConstOpWithSingleValue( } else { switch (itype.getWidth()) { case 8: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 16: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 32: - attr = DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; case 64: - attr = 
DenseElementsAttr::get(scalar_type, + return DenseElementsAttr::get(shaped_type, static_cast(value)); break; default: @@ -119,8 +127,29 @@ tsl::StatusOr CreateConstOpWithSingleValue( return tensorflow::Status(absl::StatusCode::kInvalidArgument, "Unsupported type"); } +} + +// Returns a Constant op with a splat vector value. +tsl::StatusOr CreateConstOpWithVectorValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, + int value) { + ShapedType dense_type = RankedTensorType::get(shaped_type.getShape(), + shaped_type.getElementType()); + auto attr = CreateTypedAttr(dense_type, value); + + return rewriter->create(loc, dense_type, + cast(*attr)); +} + +tsl::StatusOr CreateConstOpWithSingleValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, + int value) { + ShapedType scalar_type = + RankedTensorType::get({}, shaped_type.getElementType()); + auto attr = CreateTypedAttr(scalar_type, value); + return rewriter->create(loc, scalar_type, - cast(attr)); + cast(*attr)); } } // namespace TFL diff --git a/tensorflow/compiler/mlir/lite/utils/constant_utils.h b/tensorflow/compiler/mlir/lite/utils/constant_utils.h index 1a71bd55a85e8a..f062e31a557d17 100644 --- a/tensorflow/compiler/mlir/lite/utils/constant_utils.h +++ b/tensorflow/compiler/mlir/lite/utils/constant_utils.h @@ -31,6 +31,10 @@ namespace TFL { tsl::StatusOr CreateConstOpWithSingleValue( PatternRewriter* rewriter, Location loc, ShapedType shaped_type, int value); +// Returns a Constant op with a splat vector value. 
+tsl::StatusOr CreateConstOpWithVectorValue( + PatternRewriter* rewriter, Location loc, ShapedType shaped_type, int value); + } // namespace TFL } // namespace mlir #endif // TENSORFLOW_COMPILER_MLIR_LITE_UTILS_CONSTANT_UTILS_H_ diff --git a/tensorflow/compiler/mlir/lite/utils/utils.h b/tensorflow/compiler/mlir/lite/utils/utils.h index 6130bab6531ba2..9fce1bc44387c3 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.h +++ b/tensorflow/compiler/mlir/lite/utils/utils.h @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -16,16 +16,20 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ #define TENSORFLOW_COMPILER_MLIR_LITE_UTILS_UTILS_H_ +#include #include #include #include #include "llvm/ADT/ArrayRef.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributeInterfaces.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/Matchers.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project namespace mlir { @@ -58,6 +62,44 @@ inline bool OpHasSameStaticShapes(Operation* op) { return true; } +// Checks if all elements in the constant attribute value are 1. +inline bool IsAllOnesConstant(Attribute value) { + auto values = value.cast().getValues(); + return !std::any_of(values.begin(), values.end(), + [](int32_t element_value) { return element_value != 1; }); +} + +// Checks if all elements in the constant attribute value are non-negative. 
+inline bool HasNonNegativeValues(Attribute value) { + auto values = value.cast().getValues(); + return !std::any_of( + values.begin(), values.end(), + [](const APInt& element_value) { return element_value.isNegative(); }); +} + +// Utility function to get the offset between two dense attribute values. +inline TypedAttr GetOffSet(Attribute begin, Attribute end) { + auto begin_values = begin.cast().getValues(); + auto end_values = end.cast().getValues(); + + SmallVector offsets; + if (begin_values.size() == end_values.size()) { + for (size_t i = 0; i < begin_values.size(); ++i) { + offsets.push_back(end_values[i] - begin_values[i]); + } + } + + return mlir::DenseElementsAttr::get( + RankedTensorType::get({static_cast(offsets.size())}, + mlir::IntegerType::get(begin.getContext(), 32)), + llvm::ArrayRef(offsets)); +} + +// Check if the offset between two dense attribute values is non-negative. +inline bool HasNonNegativeOffset(Attribute begin, Attribute end) { + return HasNonNegativeValues(GetOffSet(begin, end)); +} + // Return true if the permutation value only swaps the last two dimensions inline bool AreLastTwoDimsTransposed(Value permutation) { if (!permutation) return false; diff --git a/tensorflow/compiler/mlir/lite/utils/utils.td b/tensorflow/compiler/mlir/lite/utils/utils.td index e64b591ae78eda..42af8c67b2a7ce 100644 --- a/tensorflow/compiler/mlir/lite/utils/utils.td +++ b/tensorflow/compiler/mlir/lite/utils/utils.td @@ -1,4 +1,4 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -23,6 +23,23 @@ include "mlir/IR/PatternBase.td" // if called without a ranked tensor it will fail. def GetShape: NativeCodeCall<"GetShape($0)">; +// Constraint that values in list attribute are all ones. 
+def IsAllOnesConstant : Constraint>; + +// Constraint that checks if all values in offset between two +// attributes are non-negative. +def HasNonNegativeOffset : Constraint>; + +// Constraint that checks if all values in list attribute are non-negative. +def HasNonNegativeValues : Constraint>; + +// Utility function to get the offset between two dense attribute values. +def GetOffSet : NativeCodeCall<"TFL::GetOffSet($0, $1)">; + +// Attribute Constraint that checks if the attribute value is zero. +def ZeroIntAttr + : AttrConstraint().getInt() == 0">>; + // Checks if the value has rank at most 'n'. class HasRankAtLeast : Constraint< CPred<"$0.getType().cast().hasRank() && " diff --git a/tensorflow/compiler/mlir/quantization/common/BUILD b/tensorflow/compiler/mlir/quantization/common/BUILD index fd1ce2faa8e026..421b3df68642b2 100644 --- a/tensorflow/compiler/mlir/quantization/common/BUILD +++ b/tensorflow/compiler/mlir/quantization/common/BUILD @@ -52,6 +52,7 @@ tf_cc_test( name = "lift_as_function_call_test", srcs = ["lift_as_function_call_test.cc"], deps = [ + ":func", ":lift_as_function_call", ":test_base", "//tensorflow/compiler/mlir/tensorflow", @@ -65,6 +66,35 @@ tf_cc_test( ], ) +cc_library( + name = "func", + srcs = ["func.cc"], + hdrs = ["func.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/cc/saved_model:signature_constants", + "//tensorflow/compiler/mlir/tensorflow:import_model", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "func_test", + srcs = ["func_test.cc"], + compatible_with = get_compatible_with_portable(), + deps = [ + ":func", + ":test_base", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + ], +) + cc_library( name = "test_base", testonly = 1, @@ -73,7 +103,7 @@ cc_library( compatible_with 
= get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:context", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/core:test", @@ -101,9 +131,9 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", - "@llvm-project//mlir:QuantOps", "@llvm-project//mlir:Support", ], ) @@ -113,11 +143,12 @@ tf_cc_test( srcs = ["attrs_and_constraints_test.cc"], deps = [ ":attrs_and_constraints", + ":func", ":test_base", - "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/tensorflow", "@com_google_absl//absl/strings:string_view", "@com_google_googletest//:gtest_main", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", @@ -139,3 +170,28 @@ td_library( "@llvm-project//mlir:FuncTdFiles", ], ) + +cc_library( + name = "uniform_quantized_types", + srcs = ["uniform_quantized_types.cc"], + hdrs = ["uniform_quantized_types.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) + +tf_cc_test( + name = "uniform_quantized_types_test", + srcs = ["uniform_quantized_types_test.cc"], + deps = [ + ":uniform_quantized_types", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:Support", + ], +) diff --git 
a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc index 9c3192d5345f28..1d2ccbdaaf4d2b 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.cc @@ -14,15 +14,18 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include + #include "llvm/ADT/STLExtras.h" -#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "llvm/Support/Debug.h" +#include "llvm/Support/MathExtras.h" #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" // IWYU pragma: keep namespace mlir::quant { @@ -59,4 +62,30 @@ SmallVector CloneOpWithReplacedOperands( return builder.clone(*op, mapping)->getResults(); } +FailureOr CastI64ToI32(const int64_t value) { + if (!llvm::isInt<32>(value)) { + DEBUG_WITH_TYPE( + "mlir-quant-attrs-and-constraints", + llvm::dbgs() + << "Tried to cast " << value + << "from int64 to int32, but lies out of range of int32.\n"); + return failure(); + } + return static_cast(value); +} + +FailureOr> CastI64ArrayToI32( + const ArrayRef int64_array) { + SmallVector int32_array{}; + int32_array.reserve(int64_array.size()); + + for (const int64_t i64 : int64_array) { + FailureOr cast_i32 = CastI64ToI32(i64); + if (failed(cast_i32)) return failure(); + + 
int32_array.push_back(*cast_i32); + } + return int32_array; +} + } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h index 8f298b56ec947e..e4c5e92294e221 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/utils/xla_call_module_attrs.h" namespace mlir::quant { @@ -150,6 +151,45 @@ FailureOr TryCast(Operation *op, const StringRef name) { } } +FailureOr CastI64ToI32(int64_t value); + +// Tries to cast an array of int64 to int32. If any of the element in the +// array is not in the range of int32, returns failure(). +FailureOr> CastI64ArrayToI32( + ArrayRef int64_array); + +// Returns the first user of the given operation, optionally of the given +// type if provided. If there is no user or user of type, return nullptr. +template +Operation *FindUserOfType(Operation *op) { + for (Operation *user : op->getUsers()) { + if (isa(user)) { + return user; + } + } + return nullptr; +} + +// Returns the function attribute for the given call op which is lifted for +// quantization. 
+template +inline FlatSymbolRefAttr GetFuncAttr(LiftedOp call_op) { + static_assert(false, "DuplicateOp for call_op is not implemented."); +} + +template <> +inline FlatSymbolRefAttr GetFuncAttr( + TF::PartitionedCallOp call_op) { + return call_op.getFAttr().template dyn_cast(); +} + +template <> +inline FlatSymbolRefAttr GetFuncAttr( + TF::XlaCallModuleOp call_op) { + return call_op->getAttrOfType( + TF::kStablehloEntryFunctionAttrName); +} + } // namespace mlir::quant #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_ATTRS_AND_CONSTRAINTS_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc index afa7d44e1b595e..2c466b4415818b 100644 --- a/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/attrs_and_constraints_test.cc @@ -15,8 +15,12 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include + +#include #include #include "absl/strings/string_view.h" +#include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -25,20 +29,26 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir::quant { namespace { using ::mlir::quant::QuantizationTestBase; using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::ConvolutionOp; using ::mlir::stablehlo::DotGeneralOp; +using ::mlir::stablehlo::SubtractOp; +using ::testing::ElementsAreArray; +using ::testing::NotNull; class AttrsAndConstraintsTest : public QuantizationTestBase {}; constexpr absl::string_view kModuleStatic = R"mlir( module { - func.func private @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } @@ -47,16 +57,57 @@ constexpr absl::string_view kModuleStatic = R"mlir( constexpr absl::string_view kModuleDynamic = R"mlir( module { - func.func private @main(%arg0: tensor, %arg1: tensor<1024x3xf32>) -> tensor attributes {_from_xla_call_module} { + func.func @main(%arg0: tensor, %arg1: tensor<1024x3xf32>) -> tensor attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor, tensor<1024x3xf32>) -> tensor return %0 : tensor } } )mlir"; +constexpr absl::string_view kModuleMultipleUses = R"mlir( + module { + func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general 
%arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.subtract %0, %arg2 : tensor<1x3xf32> + %2 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } + } +)mlir"; + +constexpr absl::string_view kModuleXlaCallModule = R"mlir( + module { + func.func @main(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_fn_1, _original_entry_function = "composite_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %2 : tensor + } + func.func private @composite_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + return %arg0 : tensor + } + } +)mlir"; + +constexpr absl::string_view kModulePartitionedCall = R"mlir( + module { + func.func @main(%arg0: tensor<2x2xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x2xf32>) { + %cst = "tf.Const"() {device = "", value = dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32>} : () -> tensor<2x2xf32> + %0 = "tf.PartitionedCall"(%arg0, %cst) {_tfl_quant_trait = "fully_quantizable", config = "", config_proto = "", executor_type = "", f = @composite_fn_1} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> loc(callsite("test@main"("MatMul") at "QuantizationUnit(\12\06MatMul\1a\07main)")) + return %0 : tensor<2x2xf32> + } + func.func private @composite_fn_1(%arg0: tensor<2x2xf32>, %arg1: tensor<2x2xf32>) -> tensor<2x2xf32> attributes {tf_quant.composite_function} { + %0 = "tf.MatMul"(%arg0, %arg1) {attr_map = 
"0:transpose_a,1:transpose_b", device = "", transpose_a = false, transpose_b = false} : (tensor<2x2xf32>, tensor<2x2xf32>) -> tensor<2x2xf32> + return %0 : tensor<2x2xf32> + } + } +)mlir"; + TEST_F(AttrsAndConstraintsTest, HasStaticShapeSucceedsWithStaticShapes) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStatic); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + Value dot_general_result = FindOperationOfType(main_fn)->getResult(0); EXPECT_TRUE(HasStaticShape(dot_general_result)); @@ -66,7 +117,9 @@ TEST_F(AttrsAndConstraintsTest, HasStaticShapeSucceedsWithStaticShapes) { TEST_F(AttrsAndConstraintsTest, HasStaticShapeFailsWithDynamicShapes) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleDynamic); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + Value dot_general_result = FindOperationOfType(main_fn)->getResult(0); EXPECT_FALSE(HasStaticShape(dot_general_result)); @@ -76,7 +129,9 @@ TEST_F(AttrsAndConstraintsTest, HasStaticShapeFailsWithDynamicShapes) { TEST_F(AttrsAndConstraintsTest, TryCastSucceeds) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStatic); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + Operation* dot_general_op = FindOperationOfType(main_fn); EXPECT_TRUE(succeeded( TryCast(dot_general_op, /*name=*/"dot_general_op"))); @@ -84,7 +139,9 @@ TEST_F(AttrsAndConstraintsTest, TryCastSucceeds) { TEST_F(AttrsAndConstraintsTest, TryCastFailsOnWrongType) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStatic); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + 
Operation* dot_general_op = FindOperationOfType(main_fn); EXPECT_TRUE( failed(TryCast(dot_general_op, /*name=*/"dot_general_op"))); @@ -92,7 +149,9 @@ TEST_F(AttrsAndConstraintsTest, TryCastFailsOnWrongType) { TEST_F(AttrsAndConstraintsTest, TryCastFailsOnNullPtr) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStatic); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + Operation* op_nullptr = FindOperationOfType(main_fn)->getNextNode()->getNextNode(); // getNextNode() returns a nullptr if at the very last node. @@ -101,5 +160,74 @@ TEST_F(AttrsAndConstraintsTest, TryCastFailsOnNullPtr) { EXPECT_TRUE(failed(TryCast(nullptr, /*name=*/"nullptr"))); } +TEST_F(AttrsAndConstraintsTest, I64ValueInI32RangeAreCastedCorrectly) { + EXPECT_TRUE(succeeded(CastI64ToI32(llvm::minIntN(32)))); + EXPECT_TRUE(succeeded(CastI64ToI32(llvm::maxIntN(32)))); +} + +TEST_F(AttrsAndConstraintsTest, CastingFailsForI64ValueOutOfI32Range) { + EXPECT_TRUE(failed(CastI64ToI32(llvm::minIntN(32) - 10))); + EXPECT_TRUE(failed(CastI64ToI32(llvm::maxIntN(32) + 10))); +} + +TEST_F(AttrsAndConstraintsTest, I64ArrayInI32RangeAreCastedCorrectly) { + const SmallVector array_i64 = {llvm::minIntN(32), -2, -1, 0, 1, 2, + llvm::maxIntN(32)}; + + FailureOr> array_i32 = CastI64ArrayToI32(array_i64); + EXPECT_TRUE(succeeded(array_i32)); + EXPECT_THAT( + *array_i32, + ElementsAreArray({static_cast(llvm::minIntN(32)), -2, -1, 0, 1, + 2, static_cast(llvm::maxIntN(32))})); +} + +TEST_F(AttrsAndConstraintsTest, CastingFailsForI64ArrayUnderI32Range) { + const int64_t under_min_i32 = -2147483658; + ArrayRef array_i64{under_min_i32}; + EXPECT_EQ(under_min_i32, llvm::minIntN(32) - 10); + EXPECT_TRUE(failed(CastI64ArrayToI32(array_i64))); +} + +TEST_F(AttrsAndConstraintsTest, CastingFailsForI64ArrayAboveI32Range) { + const int64_t below_max_i32 = 2147483657; + ArrayRef array_i64{below_max_i32}; + 
EXPECT_EQ(below_max_i32, llvm::maxIntN(32) + 10); + EXPECT_TRUE(failed(CastI64ArrayToI32(array_i64))); +} + +TEST_F(AttrsAndConstraintsTest, FindUserOfDifferentTypes) { + OwningOpRef module_op_ref = + ParseModuleOpString(kModuleMultipleUses); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + + Operation* dot_general_op = FindOperationOfType(main_fn); + ASSERT_NE(FindUserOfType(dot_general_op), nullptr); + ASSERT_NE(FindUserOfType(dot_general_op), nullptr); + ASSERT_NE(FindUserOfType<>(dot_general_op), nullptr); + ASSERT_EQ(FindUserOfType(dot_general_op), nullptr); +} + +TEST_F(AttrsAndConstraintsTest, CallGetFuncAttr) { + OwningOpRef xla_module_op_ref = + ParseModuleOpString(kModuleXlaCallModule); + func::FuncOp xml_main_fn = FindMainFuncOp(*xla_module_op_ref); + Operation* xla_op = FindOperationOfType(xml_main_fn); + auto xla_call_op = dyn_cast_or_null(*xla_op); + FlatSymbolRefAttr xla_call_op_attr = GetFuncAttr(xla_call_op); + EXPECT_EQ(xla_call_op_attr.getValue(), "composite_fn_1"); + + OwningOpRef partitioned_module_op_ref = + ParseModuleOpString(kModulePartitionedCall); + func::FuncOp partitioned_main_fn = FindMainFuncOp(*partitioned_module_op_ref); + Operation* partitioned_op = + FindOperationOfType(partitioned_main_fn); + auto partitioned_call_op = + dyn_cast_or_null(*partitioned_op); + FlatSymbolRefAttr partitioned_call_op_attr = GetFuncAttr(partitioned_call_op); + EXPECT_EQ(partitioned_call_op_attr.getValue(), "composite_fn_1"); +} + } // namespace } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/func.cc b/tensorflow/compiler/mlir/quantization/common/func.cc new file mode 100644 index 00000000000000..5849289e6d7ebd --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/func.cc @@ -0,0 +1,55 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/common/func.h" + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/cc/saved_model/signature_constants.h" +#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" + +namespace mlir::quant { +namespace { + +using ::tensorflow::kDefaultServingSignatureDefKey; +using ::tensorflow::kImportModelDefaultGraphFuncName; + +// Returns true iff the function's symbol is public. 
+bool IsPublicFuncOp(func::FuncOp func_op) { + return SymbolTable::getSymbolVisibility(&*func_op) == + SymbolTable::Visibility::Public; +} + +} // namespace + +func::FuncOp FindMainFuncOp(ModuleOp module_op) { + if (const auto main_func_op = module_op.lookupSymbol( + kImportModelDefaultGraphFuncName); + main_func_op != nullptr && IsPublicFuncOp(main_func_op)) { + return main_func_op; + } + + if (const auto serving_default_func_op = + module_op.lookupSymbol(kDefaultServingSignatureDefKey); + serving_default_func_op != nullptr && + IsPublicFuncOp(serving_default_func_op)) { + return serving_default_func_op; + } + + return nullptr; +} + +} // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/func.h b/tensorflow/compiler/mlir/quantization/common/func.h new file mode 100644 index 00000000000000..ade7bcfc71027b --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/func.h @@ -0,0 +1,31 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ + +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project + +namespace mlir::quant { + +// Returns a public `func::FuncOp` in `module_op` whose name matches either +// `main` or `serving_default`. If `func::FuncOps` with both names exist, the +// function with name "main" takes precedence. Returns null if no such +// function exists. +func::FuncOp FindMainFuncOp(ModuleOp module_op); + +} // namespace mlir::quant + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_FUNC_H_ diff --git a/tensorflow/compiler/mlir/quantization/common/func_test.cc b/tensorflow/compiler/mlir/quantization/common/func_test.cc new file mode 100644 index 00000000000000..8555da63b71feb --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/common/func_test.cc @@ -0,0 +1,113 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/common/func.h" + +#include +#include +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" + +namespace mlir::quant { +namespace { + +using ::testing::IsNull; +using ::testing::NotNull; + +class FindMainFuncOpTest : public QuantizationTestBase {}; + +TEST_F(FindMainFuncOpTest, ReturnsMainFuncOp) { + constexpr absl::string_view kModuleWithMainFunc = R"mlir( + module { + func.func @main() -> () { + return + } + } + )mlir"; + + OwningOpRef module_op = ParseModuleOpString(kModuleWithMainFunc); + EXPECT_THAT(*module_op, NotNull()); + + func::FuncOp main_func_op = FindMainFuncOp(*module_op); + EXPECT_THAT(main_func_op, NotNull()); +} + +TEST_F(FindMainFuncOpTest, ReturnsNullWhenMainFuncOpIsPrivate) { + constexpr absl::string_view kModuleWithPrivateMainFunc = R"mlir( + module { + func.func private @main() -> () { + return + } + } + )mlir"; + + OwningOpRef module_op = + ParseModuleOpString(kModuleWithPrivateMainFunc); + EXPECT_THAT(*module_op, NotNull()); + + EXPECT_THAT(FindMainFuncOp(*module_op), IsNull()); +} + +TEST_F(FindMainFuncOpTest, ReturnsServingDefaultFuncOp) { + constexpr absl::string_view kModuleWithServingDefaultFunc = R"mlir( + module { + func.func @serving_default() -> () { + return + } + } + )mlir"; + + OwningOpRef module_op = + ParseModuleOpString(kModuleWithServingDefaultFunc); + EXPECT_THAT(*module_op, NotNull()); + + EXPECT_THAT(FindMainFuncOp(*module_op), NotNull()); +} + +TEST_F(FindMainFuncOpTest, ReturnsNullWhenServingDefaultFuncOpIsPrivate) { + constexpr absl::string_view kModuleWithPrivateServingDefaultFunc = R"mlir( + module { + func.func private @serving_default() -> () { + return + } + } + 
)mlir"; + + OwningOpRef module_op = + ParseModuleOpString(kModuleWithPrivateServingDefaultFunc); + EXPECT_THAT(*module_op, NotNull()); + + EXPECT_THAT(FindMainFuncOp(*module_op), IsNull()); +} + +TEST_F(FindMainFuncOpTest, ReturnsNullWhenMainFuncNotFound) { + constexpr absl::string_view kModuleWithNoMainFunc = R"mlir( + module { + func.func @foo() -> () { + return + } + } + )mlir"; + + OwningOpRef module_op = ParseModuleOpString(kModuleWithNoMainFunc); + EXPECT_THAT(*module_op, NotNull()); + + EXPECT_THAT(FindMainFuncOp(*module_op), IsNull()); +} + +} // namespace +} // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc index 36ed212afb6259..2a1b10cc3163a2 100644 --- a/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/lift_as_function_call_test.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" +#include #include #include "absl/strings/string_view.h" #include "llvm/ADT/SmallVector.h" @@ -28,13 +29,14 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" namespace mlir::quant { namespace { -using ::mlir::quant::QuantizationTestBase; +using ::testing::NotNull; class LiftAsFunctionCallTest : public QuantizationTestBase {}; @@ -49,8 +51,10 @@ constexpr absl::string_view kModuleLifted = R"mlir( TEST_F(LiftAsFunctionCallTest, LiftedFunctionSucceeds) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleLifted); - func::FuncOp composite_dot_general_fn = - GetFunctionFromModule(*module_op_ref, "composite_dot_general_fn_1"); + auto composite_dot_general_fn = + module_op_ref->lookupSymbol("composite_dot_general_fn_1"); + ASSERT_THAT(composite_dot_general_fn, NotNull()); + Operation* dot_general_op = FindOperationOfType( composite_dot_general_fn); @@ -59,7 +63,7 @@ TEST_F(LiftAsFunctionCallTest, LiftedFunctionSucceeds) { constexpr absl::string_view kModuleStableHlo = R"mlir( module { - func.func private @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + func.func @main(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> return %0 : tensor<1x3xf32> } @@ -68,15 +72,18 @@ constexpr absl::string_view kModuleStableHlo = R"mlir( TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStableHlo); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, 
NotNull()); + Operation* dot_general_op = FindOperationOfType(main_fn); const SmallVector& attributes = { - builder_.getNamedAttr("precision_config", - builder_.getArrayAttr(SmallVector( - 1, stablehlo::PrecisionAttr::get( - &ctx_, stablehlo::Precision::DEFAULT)))), + builder_.getNamedAttr( + "precision_config", + builder_.getArrayAttr(SmallVector( + 1, mlir::stablehlo::PrecisionAttr::get( + ctx_.get(), mlir::stablehlo::Precision::DEFAULT)))), }; Operation* lifted_op = LiftAsFunctionCall(builder_, dot_general_op->getLoc(), @@ -99,13 +106,15 @@ TEST_F(LiftAsFunctionCallTest, FunctionLiftedAsXlaCallModuleOp) { EXPECT_EQ( lifted_dot_general_op->getAttr("precision_config").cast(), builder_.getArrayAttr(SmallVector( - 1, stablehlo::PrecisionAttr::get(&ctx_, - stablehlo::Precision::DEFAULT)))); + 1, mlir::stablehlo::PrecisionAttr::get( + ctx_.get(), mlir::stablehlo::Precision::DEFAULT)))); } TEST_F(LiftAsFunctionCallTest, FunctionNoAttrLiftedAsXlaCallModuleOp) { OwningOpRef module_op_ref = ParseModuleOpString(kModuleStableHlo); - func::FuncOp main_fn = GetFunctionFromModule(*module_op_ref, "main"); + func::FuncOp main_fn = FindMainFuncOp(*module_op_ref); + ASSERT_THAT(main_fn, NotNull()); + Operation* dot_general_op = FindOperationOfType(main_fn); Operation* lifted_op = diff --git a/tensorflow/compiler/mlir/quantization/common/test_base.h b/tensorflow/compiler/mlir/quantization/common/test_base.h index 5402769bf04c2a..1068a42a615027 100644 --- a/tensorflow/compiler/mlir/quantization/common/test_base.h +++ b/tensorflow/compiler/mlir/quantization/common/test_base.h @@ -16,6 +16,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_TEST_BASE_H_ +#include + #include #include "absl/strings/string_view.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project @@ -25,10 +27,10 @@ limitations under the License. 
#include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project -#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/context.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" @@ -40,31 +42,26 @@ using ::testing::Test; class QuantizationTestBase : public Test { protected: - QuantizationTestBase() { - ctx_.loadDialect(); + QuantizationTestBase() + : ctx_(stablehlo::CreateMlirContextForQuantization()), + builder_(ctx_.get()) { + ctx_->loadDialect(); } // Parses `module_op_str` to create a `ModuleOp`. Checks whether the created // module op is valid. OwningOpRef ParseModuleOpString( const absl::string_view module_op_str) { - auto module_op_ref = parseSourceString(module_op_str, &ctx_); + auto module_op_ref = parseSourceString(module_op_str, ctx_.get()); EXPECT_TRUE(module_op_ref); return module_op_ref; } - // Gets the function with the given name from the module. - func::FuncOp GetFunctionFromModule(ModuleOp module, - absl::string_view function_name) { - SymbolTable symbol_table(module); - return symbol_table.lookup(function_name); - } - // Returns the first operation with the given type in the function. 
template OpType FindOperationOfType(func::FuncOp function) { @@ -74,8 +71,8 @@ class QuantizationTestBase : public Test { return nullptr; } - mlir::MLIRContext ctx_{}; - OpBuilder builder_{&ctx_}; + std::unique_ptr ctx_; + OpBuilder builder_; }; } // namespace mlir::quant diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc similarity index 72% rename from tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc rename to tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc index eecc96b04be9eb..5cee0692080b2a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.cc +++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.cc @@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #include +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/Debug.h" #include "llvm/Support/MathExtras.h" #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -33,12 +35,14 @@ namespace quant { UniformQuantizedType CreateI8F32UniformQuantizedType(const Location loc, MLIRContext& context, const double scale, - const int64_t zero_point) { + const int64_t zero_point, + const bool narrow_range) { return UniformQuantizedType::getChecked( loc, /*flags=*/QuantizationFlags::Signed, /*storageType=*/IntegerType::get(&context, /*width=*/8), /*expressedType=*/FloatType::getF32(&context), scale, zero_point, - /*storageTypeMin=*/llvm::minIntN(8), /*storageTypeMax=*/llvm::maxIntN(8)); + /*storageTypeMin=*/llvm::minIntN(8) + (narrow_range ? 
1 : 0), + /*storageTypeMax=*/llvm::maxIntN(8)); } UniformQuantizedType CreateI32F32UniformQuantizedType( @@ -54,16 +58,30 @@ UniformQuantizedType CreateI32F32UniformQuantizedType( UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( const Location loc, MLIRContext& context, const ArrayRef scales, - const ArrayRef zero_points, const int quantization_dimension) { + const ArrayRef zero_points, const int quantization_dimension, + const bool narrow_range) { return UniformQuantizedPerAxisType::getChecked( loc, /*flags=*/QuantizationFlags::Signed, /*storageType=*/IntegerType::get(&context, /*width=*/8), /*expressedType=*/FloatType::getF32(&context), SmallVector(scales), SmallVector(zero_points), - quantization_dimension, /*storageTypeMin=*/llvm::minIntN(8), + quantization_dimension, + /*storageTypeMin=*/llvm::minIntN(8) + (narrow_range ? 1 : 0), /*storageTypeMax=*/llvm::maxIntN(8)); } +UniformQuantizedPerAxisType CreateI32F32UniformQuantizedPerAxisType( + const Location loc, MLIRContext& context, const ArrayRef scales, + const ArrayRef zero_points, const int quantization_dimension) { + return UniformQuantizedPerAxisType::getChecked( + loc, /*flags=*/QuantizationFlags::Signed, + /*storageType=*/IntegerType::get(&context, /*width=*/32), + /*expressedType=*/FloatType::getF32(&context), + SmallVector(scales), SmallVector(zero_points), + quantization_dimension, /*storageTypeMin=*/llvm::minIntN(32), + /*storageTypeMax=*/llvm::maxIntN(32)); +} + bool IsStorageTypeI8(const QuantizedType quantized_type) { const Type storage_type = quantized_type.getStorageType(); return storage_type.isInteger(/*width=*/8); @@ -151,6 +169,30 @@ bool IsI32F32UniformQuantizedType(const Type type) { return true; } +bool IsI32F32UniformQuantizedPerAxisType(const Type type) { + const UniformQuantizedPerAxisType quantized_per_axis_type = + type.dyn_cast_or_null(); + if (!quantized_per_axis_type) { + LLVM_DEBUG(llvm::dbgs() + << "Expected a uniform quantized type. 
Got: " << type << ".\n"); + return false; + } + + if (!IsStorageTypeI32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an i32 storage type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + if (!IsExpressedTypeF32(quantized_per_axis_type)) { + LLVM_DEBUG(llvm::dbgs() << "Expected an f32 expressed type. Got: " + << quantized_per_axis_type << ".\n"); + return false; + } + + return true; +} + // Determines whether the storage type of a quantized type is supported by // `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) { @@ -165,5 +207,18 @@ bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type) { return false; } +bool IsQuantizedTensorType(Type type) { + if (!type.isa()) { + return false; + } + Type element_type = type.cast().getElementType(); + return element_type.isa(); +} + +bool IsOpFullyQuantized(Operation* op) { + return llvm::all_of(op->getOperandTypes(), IsQuantizedTensorType) && + llvm::all_of(op->getResultTypes(), IsQuantizedTensorType); +} + } // namespace quant } // namespace mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h similarity index 69% rename from tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h rename to tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h index d04dc5a5761b8f..f1c94302d816b3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h +++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h @@ -12,8 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UNIFORM_QUANTIZED_TYPES_H_ -#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UNIFORM_QUANTIZED_TYPES_H_ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ #include @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project @@ -31,17 +32,20 @@ namespace quant { // values. The produced type has f32 as its expressed type and i8 as its // storage type. The available values use the full range of the storage value, // i.e. [-128, 127]. Assumes asymmetric quantization, meaning the zero point -// values can be non-zero values. +// value can be a non-zero value. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. UniformQuantizedType CreateI8F32UniformQuantizedType(Location loc, MLIRContext& context, double scale, - int64_t zero_point); + int64_t zero_point, + bool narrow_range = false); // Creates a `UniformQuantizedType` with the given `scale` and `zero_point` // values. The produced type has f32 as its expressed type and i32 as its // storage type. The available values use the full range of the storage value. -// Assumes asymmetric quantization, meaning the zero point values can be -// non-zero values. +// Assumes asymmetric quantization, meaning the zero point value can be +// a non-zero value. 
UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, MLIRContext& context, double scale, @@ -52,7 +56,19 @@ UniformQuantizedType CreateI32F32UniformQuantizedType(Location loc, // i8 as its storage type. The available values use the full range of the // storage value, i.e. [-128, 127]. Assumes asymmetric quantization, meaning the // zero point values can be non-zero values. +// If `narrow_range` is set true (ex: for weights), a restricted range of +// integers will be used for symmetric mapping, i.e. [-127, 127]. UniformQuantizedPerAxisType CreateI8F32UniformQuantizedPerAxisType( + Location loc, MLIRContext& context, ArrayRef scales, + ArrayRef zero_points, int quantization_dimension, + bool narrow_range = false); + +// Creates a `UniformQuantizedPerAxisType` with the given `scales` and +// `zero_points` values. The produced type has f32 as its expressed type and +// i32 as its storage type. The available values use the full range of the +// storage value. Assumes asymmetric quantization, meaning the +// zero point values can be non-zero values. +UniformQuantizedPerAxisType CreateI32F32UniformQuantizedPerAxisType( Location loc, MLIRContext& context, ArrayRef scales, ArrayRef zero_points, int quantization_dimension); @@ -74,11 +90,21 @@ bool IsI8F32UniformQuantizedPerAxisType(Type type); // 32-bit integer and expressed type is f32. bool IsI32F32UniformQuantizedType(Type type); +// Returns true iff `type` is a uniform quantized per-axis (per-channel) type +// whose storage type is 32-bit integer and expressed type is f32. +bool IsI32F32UniformQuantizedPerAxisType(Type type); + // Determines whether the storage type of a quantized type is supported by // `tfl.quantize` or `tfl.dequantize` ops. ui8, i8 and i16 are supported. bool IsSupportedByTfliteQuantizeOrDequantizeOps(IntegerType storage_type); +// Returns true if a type is quantized tensor type. +bool IsQuantizedTensorType(Type type); + +// Returns true if all operands and results are quantized. 
+bool IsOpFullyQuantized(Operation* op); + } // namespace quant } // namespace mlir -#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_UNIFORM_QUANTIZED_TYPES_H_ +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_COMMON_UNIFORM_QUANTIZED_TYPES_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc similarity index 56% rename from tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc rename to tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc index f33b322cfbd9e4..10499526873f2c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types_test.cc +++ b/tensorflow/compiler/mlir/quantization/common/uniform_quantized_types_test.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ -#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #include #include @@ -31,6 +31,7 @@ namespace quant { namespace { using ::testing::ElementsAreArray; +using ::testing::IsNull; using ::testing::NotNull; using ::testing::Test; @@ -47,7 +48,7 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, I8StorageTypeSucceeds) { const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); - + // Storage type of `i8` is currently verifiable as `unsigned` in `Types.cpp`. 
EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(8)); } @@ -76,6 +77,15 @@ TEST_F(CreateI8F32UniformQuantizedTypeTest, StorageTypeMinMaxEqualToI8MinMax) { EXPECT_EQ(quantized_type.getStorageTypeMax(), 127); } +TEST_F(CreateI8F32UniformQuantizedTypeTest, StorageTypeMinMaxNarrowRange) { + const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType( + UnknownLoc::get(&ctx_), ctx_, + /*scale=*/1.0, /*zero_point=*/0, /*narrow_range=*/true); + + EXPECT_EQ(quantized_type.getStorageTypeMin(), -127); + EXPECT_EQ(quantized_type.getStorageTypeMax(), 127); +} + TEST_F(CreateI8F32UniformQuantizedTypeTest, HasScaleAndZeroPointProperlySet) { const UniformQuantizedType quantized_type = CreateI8F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, @@ -99,6 +109,7 @@ TEST_F(CreateI32F32UniformQuantizedTypeTest, I32StorageTypeSucceeds) { CreateI32F32UniformQuantizedType(UnknownLoc::get(&ctx_), ctx_, /*scale=*/1.0, /*zero_point=*/0); + // Storage type of `i32` is currently verifiable as `unsigned` in `Types.cpp`. EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(32)); } @@ -156,6 +167,7 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, I8StorageTypeSucceeds) { /*zero_points=*/SmallVector{0, 0}, /*quantization_dimension=*/0); + // Storage type of `i8` is currently verifiable as `unsigned` in `Types.cpp`. 
EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(8)); } @@ -195,6 +207,19 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, EXPECT_EQ(quantized_type.getStorageTypeMax(), 127); } +TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, + StorageTypeMinMaxNarrowRange) { + const UniformQuantizedPerAxisType quantized_type = + CreateI8F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, + /*quantization_dimension=*/0, /*narrow_range=*/true); + + EXPECT_EQ(quantized_type.getStorageTypeMin(), -127); + EXPECT_EQ(quantized_type.getStorageTypeMax(), 127); +} + TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, HasQuantizationDimensionProperlySet) { const UniformQuantizedPerAxisType quantized_type = @@ -220,62 +245,139 @@ TEST_F(CreateI8F32UniformQuantizedPerAxisTypeTest, EXPECT_THAT(quantized_type.getZeroPoints(), ElementsAreArray({98, 99})); } +class CreateI32F32UniformQuantizedPerAxisTypeTest : public Test { + protected: + CreateI32F32UniformQuantizedPerAxisTypeTest() : ctx_() { + ctx_.loadDialect(); + } + + MLIRContext ctx_; +}; + +TEST_F(CreateI32F32UniformQuantizedPerAxisTypeTest, I32StorageTypeSucceeds) { + const UniformQuantizedPerAxisType quantized_type = + CreateI32F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, + /*quantization_dimension=*/0); + + // Storage type of `i32` is currently verifiable as `unsigned` in `Types.cpp`. 
+ EXPECT_TRUE(quantized_type.getStorageType().isSignlessInteger(32)); +} + +TEST_F(CreateI32F32UniformQuantizedPerAxisTypeTest, F32ExpressedTypeSucceeds) { + const UniformQuantizedPerAxisType quantized_type = + CreateI32F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, + /*quantization_dimension=*/0); + + EXPECT_TRUE(quantized_type.getExpressedType().isF32()); +} + +TEST_F(CreateI32F32UniformQuantizedPerAxisTypeTest, + StorageTypeMinMaxEqualToI32MinMax) { + const UniformQuantizedPerAxisType quantized_type = + CreateI32F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, + /*quantization_dimension=*/0); + + EXPECT_EQ(quantized_type.getStorageTypeMin(), + std::numeric_limits::min()); + EXPECT_EQ(quantized_type.getStorageTypeMax(), + std::numeric_limits::max()); +} + +TEST_F(CreateI32F32UniformQuantizedPerAxisTypeTest, + HasQuantizationDimensionProperlySet) { + const UniformQuantizedPerAxisType quantized_type = + CreateI32F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{1.0, 1.0}, + /*zero_points=*/SmallVector{0, 0}, + /*quantization_dimension=*/3); + + EXPECT_EQ(quantized_type.getQuantizedDimension(), 3); +} + +TEST_F(CreateI32F32UniformQuantizedPerAxisTypeTest, + HasScaleAndZeroPointProperlySet) { + const UniformQuantizedPerAxisType quantized_type = + CreateI32F32UniformQuantizedPerAxisType( + UnknownLoc::get(&ctx_), ctx_, + /*scales=*/SmallVector{8.0, 9.0}, + /*zero_points=*/SmallVector{98, 99}, + /*quantization_dimension=*/0); + + EXPECT_THAT(quantized_type.getScales(), ElementsAreArray({8.0, 9.0})); + EXPECT_THAT(quantized_type.getZeroPoints(), ElementsAreArray({98, 99})); +} + class IsI8F32UniformQuantizedTypeTest : public Test { protected: - IsI8F32UniformQuantizedTypeTest() { + IsI8F32UniformQuantizedTypeTest() : builder_(&ctx_) { ctx_.loadDialect(); } 
MLIRContext ctx_; - OpBuilder builder_{&ctx_}; + OpBuilder builder_; }; TEST_F(IsI8F32UniformQuantizedTypeTest, I8F32UniformQuantizedTypeSucceeds) { const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsI8F32UniformQuantizedType(qi8_type)); } TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_THAT(qi8_type.dyn_cast_or_null(), NotNull()); } TEST_F(IsI8F32UniformQuantizedTypeTest, StorageTypeI8Succeeds) { const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsStorageTypeI8(qi8_type)); } TEST_F(IsI8F32UniformQuantizedTypeTest, ExpressedTypeF32Succeeds) { const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scale=*/1.0, + /*zeroPoint=*/0, 
/*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsExpressedTypeF32(qi8_type)); } class IsI8F32UniformQuantizedPerAxisTypeTest : public Test { protected: - IsI8F32UniformQuantizedPerAxisTypeTest() { + IsI8F32UniformQuantizedPerAxisTypeTest() : builder_(&ctx_) { ctx_.loadDialect(); } MLIRContext ctx_; - OpBuilder builder_{&ctx_}; + OpBuilder builder_; }; TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, I8F32UniformQuantizedPerAxisTypeSucceeds) { const UniformQuantizedPerAxisType qi8_per_axis_type = quant::UniformQuantizedPerAxisType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scales=*/{1.0}, - /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, - /*storageTypeMax=*/255); + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/-128, + /*storageTypeMax=*/127); EXPECT_TRUE(IsI8F32UniformQuantizedPerAxisType(qi8_per_axis_type)); EXPECT_FALSE(IsI8F32UniformQuantizedType(qi8_per_axis_type)); } @@ -283,10 +385,11 @@ TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { const UniformQuantizedPerAxisType qi8_per_axis_type = quant::UniformQuantizedPerAxisType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scales=*/{1.0}, - /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, - /*storageTypeMax=*/255); + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/-128, + /*storageTypeMax=*/127); EXPECT_THAT(qi8_per_axis_type.dyn_cast_or_null(), NotNull()); } @@ -294,96 +397,187 @@ TEST_F(IsI8F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, StorageTypeI8Succeeds) { const UniformQuantizedPerAxisType qi8_per_axis_type = quant::UniformQuantizedPerAxisType::get( - /*flags=*/0, 
builder_.getI8Type(), builder_.getF32Type(), + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scales=*/{1.0}, - /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, - /*storageTypeMax=*/255); + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/-128, + /*storageTypeMax=*/127); EXPECT_TRUE(IsStorageTypeI8(qi8_per_axis_type)); } TEST_F(IsI8F32UniformQuantizedPerAxisTypeTest, ExpressedTypeF32Succeeds) { const UniformQuantizedPerAxisType qi8_per_axis_type = quant::UniformQuantizedPerAxisType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), /*scales=*/{1.0}, - /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/0, - /*storageTypeMax=*/255); + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, /*storageTypeMin=*/-128, + /*storageTypeMax=*/127); EXPECT_TRUE(IsExpressedTypeF32(qi8_per_axis_type)); } class IsI32F32UniformQuantizedTypeTest : public Test { protected: - IsI32F32UniformQuantizedTypeTest() { + IsI32F32UniformQuantizedTypeTest() : builder_(&ctx_) { ctx_.loadDialect(); } MLIRContext ctx_; - OpBuilder builder_{&ctx_}; + OpBuilder builder_; }; TEST_F(IsI32F32UniformQuantizedTypeTest, I32F32UniformQuantizedTypeSucceeds) { const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI32Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-2147483647, + /*storageTypeMax=*/2147483646); EXPECT_TRUE(IsI32F32UniformQuantizedType(qi32_type)); } TEST_F(IsI32F32UniformQuantizedTypeTest, UniformQuantizedTypeSucceeds) { const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), /*scale=*/1.0, - 
/*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-2147483647, + /*storageTypeMax=*/2147483646); + EXPECT_TRUE(IsI32F32UniformQuantizedType(qi32_type)); EXPECT_THAT(qi32_type.dyn_cast_or_null(), NotNull()); } TEST_F(IsI32F32UniformQuantizedTypeTest, StorageTypeI32Succeeds) { const UniformQuantizedType qi32_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI32Type(), builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-2147483647, + /*storageTypeMax=*/2147483646); + EXPECT_TRUE(IsI32F32UniformQuantizedType(qi32_type)); EXPECT_TRUE(IsStorageTypeI32(qi32_type)); } TEST_F(IsI32F32UniformQuantizedTypeTest, ExpressedTypeF32Succeeds) { const UniformQuantizedType qi32_per_axis_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getI8Type(), builder_.getF32Type(), + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*zeroPoint=*/0, /*storageTypeMin=*/-2147483647, + /*storageTypeMax=*/2147483646); + EXPECT_TRUE(IsExpressedTypeF32(qi32_per_axis_type)); +} + +class IsI32F32UniformQuantizedPerAxisTypeTest : public Test { + protected: + IsI32F32UniformQuantizedPerAxisTypeTest() : builder_(&ctx_) { + ctx_.loadDialect(); + } + + MLIRContext ctx_; + OpBuilder builder_; +}; + +TEST_F(IsI32F32UniformQuantizedPerAxisTypeTest, + I32F32UniformQuantizedPerAxisTypeSucceeds) { + const UniformQuantizedPerAxisType qi32_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scales=*/{1.0}, + 
/*zeroPoints=*/{0}, /*quantizedDimension=*/0, + /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); + EXPECT_TRUE(IsI32F32UniformQuantizedPerAxisType(qi32_per_axis_type)); + EXPECT_FALSE(IsI32F32UniformQuantizedType(qi32_per_axis_type)); +} + +TEST_F(IsI32F32UniformQuantizedPerAxisTypeTest, + I8F32UniformQuantizedTypeFails) { + const UniformQuantizedType qi8_type = quant::UniformQuantizedType::get( + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), + /*scale=*/1.0, /*zeroPoint=*/0, /*storageTypeMin=*/-128, + /*storageTypeMax=*/127); + EXPECT_FALSE(IsI32F32UniformQuantizedPerAxisType(qi8_type)); + EXPECT_FALSE(IsStorageTypeI32(qi8_type)); + EXPECT_THAT(qi8_type.dyn_cast_or_null(), + IsNull()); +} + +TEST_F(IsI32F32UniformQuantizedTypeTest, UniformQuantizedPerAxisTypeSucceeds) { + const UniformQuantizedPerAxisType qi32_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, + /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); + + EXPECT_THAT( + qi32_per_axis_type.dyn_cast_or_null(), + NotNull()); +} + +TEST_F(IsI32F32UniformQuantizedPerAxisTypeTest, StorageTypeI8Succeeds) { + const UniformQuantizedPerAxisType qi32_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, /*quantizedDimension=*/0, + /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); + + EXPECT_TRUE(IsStorageTypeI32(qi32_per_axis_type)); +} + +TEST_F(IsI32F32UniformQuantizedPerAxisTypeTest, ExpressedTypeF32Succeeds) { + const UniformQuantizedPerAxisType qi32_per_axis_type = + quant::UniformQuantizedPerAxisType::get( + /*flags=*/QuantizationFlags::Signed, builder_.getI32Type(), + builder_.getF32Type(), + /*scales=*/{1.0}, + /*zeroPoints=*/{0}, 
/*quantizedDimension=*/0, + /*storageTypeMin=*/-2147483647, /*storageTypeMax=*/2147483646); EXPECT_TRUE(IsExpressedTypeF32(qi32_per_axis_type)); } class IsSupportedByTfliteQuantizeOrDequantizeOpsTest : public Test { protected: - IsSupportedByTfliteQuantizeOrDequantizeOpsTest() { + IsSupportedByTfliteQuantizeOrDequantizeOpsTest() : builder_(&ctx_) { ctx_.loadDialect(); } MLIRContext ctx_; - OpBuilder builder_{&ctx_}; + OpBuilder builder_; }; TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeI8Succeeds) { auto qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getIntegerType(8, /*isSigned=*/true), - builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( dyn_cast_or_null(qi8_type.getStorageType()))); } TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeI16Succeeds) { auto qi16_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getIntegerType(16, /*isSigned=*/true), - builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), + /*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( dyn_cast_or_null(qi16_type.getStorageType()))); } TEST_F(IsSupportedByTfliteQuantizeOrDequantizeOpsTest, StorageTypeUI8Succeeds) { auto qi8_type = quant::UniformQuantizedType::get( - /*flags=*/0, builder_.getIntegerType(8, /*isSigned=*/false), - builder_.getF32Type(), /*scale=*/1.0, - /*zeroPoint=*/0, /*storageTypeMin=*/0, /*storageTypeMax=*/255); + /*flags=*/QuantizationFlags::Signed, builder_.getI8Type(), + builder_.getF32Type(), + 
/*scale=*/1.0, + /*zeroPoint=*/0, /*storageTypeMin=*/-128, /*storageTypeMax=*/127); EXPECT_TRUE(IsSupportedByTfliteQuantizeOrDequantizeOps( dyn_cast_or_null(qi8_type.getStorageType()))); } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD index 7d2d1f87ef1831..439e8542cf7d0e 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/BUILD @@ -52,7 +52,6 @@ cc_library( "passes/lift_quantizable_spots_as_functions_fusion.inc", "passes/lift_quantizable_spots_as_functions_simple.inc", "passes/optimize_graph.cc", - "passes/populate_shape.cc", "passes/post_quantize.cc", "passes/prepare_quantize.cc", "passes/quantize.cc", @@ -74,16 +73,17 @@ cc_library( ":lift_quantizable_spots_as_functions_fusion_inc_gen", ":lift_quantizable_spots_as_functions_simple_inc_gen", ":optimize_graph_inc_gen", + ":quantization_config_proto_cc", ":quantization_options_proto_cc", ":quantization_patterns", ":stablehlo_passes_inc_gen", ":stablehlo_type_utils", - ":uniform_quantized_types", "//tensorflow/compiler/mlir/lite:tensorflow_lite", "//tensorflow/compiler/mlir/lite/quantization:quantization_config", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:func", "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/quantization/common/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", @@ -109,6 +109,7 @@ cc_library( "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/random", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@eigen_archive//:eigen3", @@ 
-124,6 +125,7 @@ cc_library( "@llvm-project//mlir:Support", "@llvm-project//mlir:TransformUtils", "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/platform:regexp", "@local_tsl//tsl/platform:str_util", "@local_tsl//tsl/protobuf:protos_all_cc", "@local_xla//xla/mlir_hlo", @@ -144,15 +146,14 @@ cc_library( ], compatible_with = get_compatible_with_portable(), deps = [ - ":uniform_quantized_types", "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:uniform_quantized_types", "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:xla_call_module_attrs", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:path", "@com_google_absl//absl/algorithm:container", @@ -474,28 +475,38 @@ gentbl_cc_library( cc_library( name = "test_passes", srcs = [ + "passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc", "passes/testing/test_post_calibration_component.cc", "passes/testing/test_pre_calibration_component.cc", + "passes/testing/test_tf_to_stablehlo_pass.cc", ], hdrs = [ "passes/testing/passes.h", ], compatible_with = get_compatible_with_portable(), deps = [ + ":passes", ":quantization_config_proto_cc", ":stablehlo_test_passes_inc_gen", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:post_calibration", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pre_calibration", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantize_preprocess", + 
"//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:QuantOps", + "@llvm-project//mlir:SparseTensorDialect", "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:protobuf", "@local_xla//xla/mlir_hlo", "@stablehlo//:chlo_ops", "@stablehlo//:stablehlo_ops", @@ -741,28 +752,3 @@ tf_cc_binary( "@stablehlo//:vhlo_ops", ], ) - -cc_library( - name = "uniform_quantized_types", - srcs = ["uniform_quantized_types.cc"], - hdrs = ["uniform_quantized_types.h"], - compatible_with = get_compatible_with_portable(), - deps = [ - "@llvm-project//llvm:Support", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:Support", - ], -) - -tf_cc_test( - name = "uniform_quantized_types_test", - srcs = ["uniform_quantized_types_test.cc"], - deps = [ - ":uniform_quantized_types", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:QuantOps", - "@llvm-project//mlir:Support", - ], -) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD index d02f2651cfd3e2..10d4b020166552 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/BUILD @@ -7,8 +7,9 @@ load( package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], default_visibility = [ - "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", - "//tensorflow/compiler/mlir/quantization/tensorflow:__subpackages__", + # For TFLite Converter integration. 
+ "//tensorflow/compiler/mlir/lite:__subpackages__", + "//tensorflow/compiler/mlir/quantization:__subpackages__", ], licenses = ["notice"], ) @@ -32,6 +33,29 @@ cc_library( ], ) +# OSS: This is a header-only target. Do NOT directly depend on `config_impl` unless it is necessary +# (e.g. undefined symbol error), to avoid ODR violation. +cc_library( + name = "config", + hdrs = ["config.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + ], +) + +# OSS: This is a impl target corresponding to `config`. Do NOT directly depend on `config_impl` +# unless it is necessary (e.g. undefined symbol error), to avoid ODR violation. +cc_library( + name = "config_impl", + srcs = ["config.cc"], + hdrs = ["config.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + ], +) + cc_library( name = "io", srcs = ["io.cc"], @@ -187,6 +211,9 @@ cc_library( deps = [ "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/quantization/stablehlo:passes", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:passes", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:Pass", @@ -210,7 +237,6 @@ cc_library( deps = [ ":component", ":pass_pipeline", - "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", @@ -219,7 +245,6 @@ cc_library( "@com_google_absl//absl/log:die_if_null", 
"@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:errors", @@ -264,7 +289,6 @@ cc_library( deps = [ ":component", ":pass_pipeline", - "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", "//tensorflow/compiler/mlir/quantization/tensorflow:passes", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:run_passes", @@ -272,7 +296,6 @@ cc_library( "@com_google_absl//absl/log:die_if_null", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings:string_view", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:errors", @@ -311,6 +334,11 @@ cc_library( compatible_with = get_compatible_with_portable(), visibility = [ "//tensorflow:__pkg__", + "//tensorflow/compiler/mlir/lite:__pkg__", # For tf_tfl_translate binary. + # For odml_to_stablehlo binary. + "//tensorflow/compiler/mlir/lite/stablehlo:__pkg__", + # For StableHLO Quantizer adapter functionalities within TFLite. Testonly. 
+ "//tensorflow/compiler/mlir/lite/quantization/stablehlo:__pkg__", "//tensorflow/python:__pkg__", ], deps = [ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD index 427f61996b2881..b31b9d5321a1ab 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/BUILD @@ -67,6 +67,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":assign_ids", + ":representative_dataset", ":statistics", "//tensorflow/cc/saved_model:loader", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", @@ -104,3 +105,33 @@ cc_library( "@local_tsl//tsl/platform:statusor", ], ) + +cc_library( + name = "representative_dataset", + srcs = ["representative_dataset.cc"], + hdrs = ["representative_dataset.h"], + compatible_with = get_compatible_with_portable(), + deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/types:span", + ], +) + +tf_cc_test( + name = "representative_dataset_test", + srcs = ["representative_dataset_test.cc"], + deps = [ + ":representative_dataset", + "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_cc", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", + "@com_google_googletest//:gtest_main", + "@local_tsl//tsl/platform:status_matchers", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc index 
98131a96bb3aba..4f6478efebf2f5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.cc @@ -40,6 +40,7 @@ limitations under the License. #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/cc/saved_model/loader.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/assign_ids.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/statistics.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/io.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h" @@ -60,12 +61,13 @@ limitations under the License. #include "tsl/platform/statusor.h" namespace mlir::quant::stablehlo { - namespace { using ::stablehlo::quantization::AddCalibrationStatistics; using ::stablehlo::quantization::AssignIdsToCustomAggregatorOps; +using ::stablehlo::quantization::CreateRepresentativeDatasetFileMap; using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::RepresentativeDatasetConfig; using ::stablehlo::quantization::io::CreateTmpDir; using ::stablehlo::quantization::io::GetLocalTmpFileName; using ::tensorflow::AssetFileDef; @@ -77,7 +79,6 @@ using ::tensorflow::quantization::CalibrationOptions; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PreprocessAndFreezeGraph; using ::tensorflow::quantization::PyFunctionLibrary; -using ::tensorflow::quantization::RepresentativeDatasetFile; using ::tensorflow::quantization::RunPasses; using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables; @@ -132,7 +133,8 @@ absl::StatusOr> RunExportPasses( } if (absl::Status pass_run_status = RunPasses( - /*name=*/export_opts.debug_name, + /*name=*/ + export_opts.debug_name, /*add_passes_func=*/ [dup_constants = 
export_opts.duplicate_shape_determining_constants]( PassManager& pm) { AddExportPasses(pm, dup_constants); }, @@ -160,8 +162,6 @@ CalibrationComponent::CalibrationComponent( std::unordered_set tags, absl::flat_hash_map signature_def_map, std::vector signature_keys, - absl::flat_hash_map - representative_dataset_file_map, const CalibrationOptions& calibration_options) : ctx_(ABSL_DIE_IF_NULL(ctx)), // Crash OK py_function_lib_(ABSL_DIE_IF_NULL(py_function_lib)), // Crash OK @@ -170,7 +170,6 @@ CalibrationComponent::CalibrationComponent( tags_(std::move(tags)), signature_def_map_(std::move(signature_def_map)), signature_keys_(std::move(signature_keys)), - representative_dataset_file_map_(representative_dataset_file_map), calibration_options_(calibration_options) {} absl::StatusOr CalibrationComponent::ExportToSavedModel( @@ -247,13 +246,23 @@ absl::StatusOr CalibrationComponent::Run( ExportedModel exported_model, ExportToSavedModel(module_op, precalibrated_saved_model_dir)); + // Translates `RepresentativeDatasetConfig`s to signature key -> + // `RepresentativeDatasetFile` mapping. + const auto dataset_configs = + config.static_range_ptq_preset().representative_datasets(); + const std::vector dataset_config_vector( + dataset_configs.begin(), dataset_configs.end()); + TF_ASSIGN_OR_RETURN( + const auto representative_dataset_file_map, + CreateRepresentativeDatasetFileMap(dataset_config_vector)); + // Runs calibration on the exported model. The statistics will be stored in a // separate singleton object `CalibratorSingleton` and are directly added to // `exported_model` without re-importing it. 
py_function_lib_->RunCalibration(precalibrated_saved_model_dir, signature_keys_, tags_, calibration_options_, /*force_graph_mode_calibration=*/true, - representative_dataset_file_map_); + representative_dataset_file_map); if (absl::Status status = AddCalibrationStatistics(*exported_model.mutable_graph_def(), diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h index f049ab45c7561f..1d2cb94b93a0bc 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/component.h @@ -66,9 +66,6 @@ class CalibrationComponent : public Component { absl::flat_hash_map signature_def_map, std::vector signature_keys, - absl::flat_hash_map - representative_dataset_file_map, const tensorflow::quantization::CalibrationOptions& calibration_options); // Runs calibration on `module_op` and returns a calibrated ModuleOp with @@ -113,13 +110,6 @@ class CalibrationComponent : public Component { // Signature keys to identify the functions to load & quantize. const std::vector signature_keys_; - // Map from signature key to the representative dataset file. The keys should - // match `signature_keys_`. The representative datasets will be fed to the - // pre-calibrated graph to collect statistics. - const absl::flat_hash_map - representative_dataset_file_map_; - // Configures the calibration behavior. const tensorflow::quantization::CalibrationOptions calibration_options_; }; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.cc new file mode 100644 index 00000000000000..4c8ab1df2ecc03 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.cc @@ -0,0 +1,60 @@ +/* Copyright 2024 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h" + +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace stablehlo::quantization { + +using ::tensorflow::quantization::RepresentativeDatasetFile; + +absl::StatusOr> +CreateRepresentativeDatasetFileMap(absl::Span + representative_dataset_configs) { + absl::flat_hash_map + repr_dataset_file_map{}; + + for (const RepresentativeDatasetConfig& dataset_config : + representative_dataset_configs) { + RepresentativeDatasetFile repr_dataset_file; + + repr_dataset_file.set_tfrecord_file_path(dataset_config.tf_record().path()); + // If the signature_key has not been explicitly specified, use the default + // value of "serving_default". + const std::string signature_key = dataset_config.has_signature_key() + ? 
dataset_config.signature_key() + : "serving_default"; + if (repr_dataset_file_map.contains(signature_key)) { + return absl::InvalidArgumentError( + absl::StrCat("RepresentativeDatasetConfig should not contain " + "duplicate signature key: ", + signature_key)); + } + repr_dataset_file_map[signature_key] = std::move(repr_dataset_file); + } + + return repr_dataset_file_map; +} + +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h new file mode 100644 index 00000000000000..33357630aa1098 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h @@ -0,0 +1,41 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_ + +#include <string> + +#include "absl/container/flat_hash_map.h" +#include "absl/status/statusor.h" +#include "absl/types/span.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" + +namespace stablehlo::quantization { + +// Translates a set of `RepresentativeDatasetConfig` to signature key -> +// `RepresentativeDatasetFile` mapping. This is useful when using +// `RepresentativeDatasetConfig`s at places that accept the legacy +// `RepresentativeDatasetFile` mapping. +// Returns a non-OK status when there is a duplicate signature key among +// `representative_dataset_configs`. +absl::StatusOr<absl::flat_hash_map< +    std::string, tensorflow::quantization::RepresentativeDatasetFile>> +CreateRepresentativeDatasetFileMap(absl::Span<const RepresentativeDatasetConfig> + representative_dataset_configs); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CALIBRATION_REPRESENTATIVE_DATASET_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc new file mode 100644 index 00000000000000..aaedfc72086f07 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset_test.cc @@ -0,0 +1,110 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/calibration/representative_dataset.h" + +#include +#include + +#include +#include +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tsl/platform/status_matchers.h" + +namespace mlir::quant::stablehlo { +namespace { + +using ::stablehlo::quantization::RepresentativeDatasetConfig; +using ::tensorflow::quantization::RepresentativeDatasetFile; +using ::testing::Contains; +using ::testing::HasSubstr; +using ::testing::Key; +using ::testing::SizeIs; +using ::testing::StrEq; +using ::tsl::testing::IsOk; +using ::tsl::testing::StatusIs; + +TEST(CreateRepresentativeDatasetFileMapTest, + ConfigWithoutExplicitSignatureKeyMappedToServingDefault) { + std::vector representative_dataset_configs; + + RepresentativeDatasetConfig config{}; + *(config.mutable_tf_record()->mutable_path()) = "test_path"; + representative_dataset_configs.push_back(config); + + const absl::StatusOr< + absl::flat_hash_map> + representative_dataset_file_map = + CreateRepresentativeDatasetFileMap(representative_dataset_configs); + + ASSERT_THAT(representative_dataset_file_map, IsOk()); + ASSERT_THAT(*representative_dataset_file_map, SizeIs(1)); + EXPECT_THAT(*representative_dataset_file_map, + Contains(Key("serving_default"))); + EXPECT_THAT(representative_dataset_file_map->at("serving_default") + 
.tfrecord_file_path(), + StrEq("test_path")); +} + +TEST(CreateRepresentativeDatasetFileMapTest, ConfigWithExplicitSignatureKey) { + std::vector representative_dataset_configs; + + RepresentativeDatasetConfig config{}; + config.set_signature_key("test_signature_key"); + *(config.mutable_tf_record()->mutable_path()) = "test_path"; + representative_dataset_configs.push_back(config); + + const absl::StatusOr< + absl::flat_hash_map> + representative_dataset_file_map = + CreateRepresentativeDatasetFileMap(representative_dataset_configs); + + ASSERT_THAT(representative_dataset_file_map, IsOk()); + ASSERT_THAT(*representative_dataset_file_map, SizeIs(1)); + EXPECT_THAT(*representative_dataset_file_map, + Contains(Key(StrEq("test_signature_key")))); + EXPECT_THAT(representative_dataset_file_map->at("test_signature_key") + .tfrecord_file_path(), + StrEq("test_path")); +} + +TEST(CreateRepresentativeDatasetFileMapTest, + ConfigWithDuplicateSignatureKeyReturnsInvalidArgumentError) { + std::vector representative_dataset_configs; + + RepresentativeDatasetConfig config_1{}; + config_1.set_signature_key("serving_default"); + *(config_1.mutable_tf_record()->mutable_path()) = "test_path_1"; + representative_dataset_configs.push_back(config_1); + + // Signature key is implicitly "serving_default". 
+ RepresentativeDatasetConfig config_2{}; + *(config_2.mutable_tf_record()->mutable_path()) = "test_path_2"; + representative_dataset_configs.push_back(config_2); + + const absl::StatusOr< + absl::flat_hash_map> + representative_dataset_file_map = + CreateRepresentativeDatasetFileMap(representative_dataset_configs); + + EXPECT_THAT(representative_dataset_file_map, + StatusIs(absl::StatusCode::kInvalidArgument, + HasSubstr("duplicate signature key: serving_default"))); +} + +} // namespace +} // namespace mlir::quant::stablehlo diff --git a/third_party/xla/third_party/tsl/tsl/platform/jpeg.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc similarity index 52% rename from third_party/xla/third_party/tsl/tsl/platform/jpeg.h rename to tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc index a7b640db03943f..679e1f8754be9b 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/jpeg.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -12,18 +12,20 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" -#ifndef TENSORFLOW_TSL_PLATFORM_JPEG_H_ -#define TENSORFLOW_TSL_PLATFORM_JPEG_H_ +namespace stablehlo::quantization { -#include -#include -#include -#include +QuantizationConfig PopulateDefaults( + const QuantizationConfig& user_provided_config) { + QuantizationConfig config = user_provided_config; -extern "C" { -#include "jerror.h" // from @libjpeg_turbo // IWYU pragma: export -#include "jpeglib.h" // from @libjpeg_turbo // IWYU pragma: export + PipelineConfig& pipeline_config = *config.mutable_pipeline_config(); + if (!pipeline_config.has_unpack_quantized_types()) { + pipeline_config.set_unpack_quantized_types(true); + } + + return config; } -#endif // TENSORFLOW_TSL_PLATFORM_JPEG_H_ +} // namespace stablehlo::quantization diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h new file mode 100644 index 00000000000000..20b9efa4a60fa0 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h @@ -0,0 +1,29 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ +#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ + +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" + +namespace stablehlo::quantization { + +// Returns a copy of `user_provided_config` with default values populated where +// the user did not explicitly specify. +QuantizationConfig PopulateDefaults( + const QuantizationConfig& user_provided_config); + +} // namespace stablehlo::quantization + +#endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_CONFIG_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc index 1b14bd56aaa44f..31c67f2d20c4ff 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.cc @@ -20,11 +20,45 @@ limitations under the License. 
#include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" namespace mlir::quant::stablehlo { +using ::stablehlo::quantization::PipelineConfig; +using ::stablehlo::quantization::QuantizationSpecs; +using ::stablehlo::quantization::StaticRangePtqPreset; +using ::tensorflow::quantization::CalibrationOptions; + +void AddPreCalibrationPasses(OpPassManager& pm, + const CalibrationOptions& calibration_options, + const QuantizationSpecs& quantization_specs) { + pm.addPass(CreateLiftQuantizableSpotsAsFunctionsPass(quantization_specs)); + pm.addNestedPass( + CreateInsertCustomAggregationOpsPass(calibration_options)); + pm.addPass(CreateIssueIDsOfCustomAggregationOpsPass()); + // StableHLO Quantizer currently uses TF's calibration passes. Serialize + // the StableHLO module as tf.XlaCallModule to run calibration. 
+ AddCallModuleSerializationPasses(pm); +} + +void AddPostCalibrationPasses( + OpPassManager& pm, const PipelineConfig& pipeline_config, + const StaticRangePtqPreset& static_range_ptq_preset) { + QuantizeCompositeFunctionsPassOptions options; + options.enable_per_channel_quantized_weight_ = + static_range_ptq_preset.enable_per_channel_quantized_weight(); + pm.addNestedPass( + CreateConvertCustomAggregationOpToQuantStatsPass()); + pm.addPass(createQuantizeCompositeFunctionsPass(options)); + if (pipeline_config.unpack_quantized_types()) { + AddStablehloQuantToIntPasses(pm); + } + AddCallModuleSerializationPasses(pm); +} + void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm) { pm.addPass(TF::CreateXlaCallModuleDeserializationPass()); pm.addPass(createRestoreFunctionNamePass()); @@ -60,6 +94,10 @@ void AddStablehloQuantToIntPasses(OpPassManager& pm) { // NOMUTANTS -- Add tests for individual passes with migration below. void AddCallModuleSerializationPasses(OpPassManager& pm) { AddShapeLegalizationPasses(pm); + // Add an inliner pass to inline quantized StableHLO functions (and others) so + // that StableHLO ops are properly grouped and converted into XlaCallModule + // ops by the ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass. + pm.addPass(createInlinerPass()); pm.addPass(createReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass()); // ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass may create // duplicate constants. Add canonicalizer to deduplicate. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h index e5272732400365..5920619bd3fb8d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h @@ -16,9 +16,26 @@ limitations under the License. 
#define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_CC_PASS_PIPELINE_H_ #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" namespace mlir::quant::stablehlo { +// Adds passes for static-range quantization pre-calibration. Inserts ops +// required to collect tensor statistics. +void AddPreCalibrationPasses( + OpPassManager& pm, + const ::tensorflow::quantization::CalibrationOptions& calibration_options, + const ::stablehlo::quantization::QuantizationSpecs& specs); + +// Adds passes for static-range quantization post-calibration. Utilizes tensor +// statistics collected from the calibration step and performs quantization. +void AddPostCalibrationPasses( + OpPassManager& pm, + const ::stablehlo::quantization::PipelineConfig& pipeline_config, + const ::stablehlo::quantization::StaticRangePtqPreset& + static_range_ptq_preset); + // Deserializes StableHLO functions serialized and embedded in XlaCallModuleOps. void AddXlaCallModuleOpDeserializationPasses(OpPassManager& pm); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc index 7c52a1c7d6790b..6f5f10b48f41f5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.cc @@ -17,20 +17,19 @@ limitations under the License. 
#include "absl/base/nullability.h" #include "absl/log/die_if_null.h" #include "absl/status/statusor.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU: keep #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" #include "tsl/platform/errors.h" namespace mlir::quant::stablehlo { +using ::stablehlo::quantization::PipelineConfig; using ::stablehlo::quantization::QuantizationConfig; +using ::stablehlo::quantization::StaticRangePtqPreset; using ::tensorflow::quantization::RunPasses; PostCalibrationComponent::PostCalibrationComponent( @@ -39,20 +38,20 @@ PostCalibrationComponent::PostCalibrationComponent( absl::StatusOr PostCalibrationComponent::Run( ModuleOp module_op, const QuantizationConfig& config) { - TF_RETURN_IF_ERROR( - RunPasses(/*name=*/kName, - /*add_passes_func=*/[this](PassManager& pm) { AddPasses(pm); }, - *ctx_, module_op)); + TF_RETURN_IF_ERROR(RunPasses( + kName, /*add_passes_func=*/ + [&config, this](PassManager& pm) { + AddPostCalibrationPasses(pm, config.pipeline_config(), + config.static_range_ptq_preset()); + }, + *ctx_, module_op)); return module_op; } -void PostCalibrationComponent::AddPasses(OpPassManager& pm) const { - pm.addNestedPass( - CreateConvertCustomAggregationOpToQuantStatsPass()); - pm.addPass(createQuantizeCompositeFunctionsPass()); - pm.addPass(createOptimizeGraphPass()); - AddStablehloQuantToIntPasses(pm); - AddCallModuleSerializationPasses(pm); +void PostCalibrationComponent::AddPasses( + OpPassManager& pm, const 
StaticRangePtqPreset& static_range_ptq_preset, + const PipelineConfig& pipeline_config) const { + AddPostCalibrationPasses(pm, pipeline_config, static_range_ptq_preset); } } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h index 3ad71f05079162..3c218c9f857524 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h @@ -45,9 +45,11 @@ class PostCalibrationComponent : public Component { ModuleOp module_op, const ::stablehlo::quantization::QuantizationConfig& config) override; - // Adds MLIR passes to the pass manager. `Run` will essentially run these - // passes on the module op. - void AddPasses(OpPassManager& pm) const; + void AddPasses( + OpPassManager& pm, + const ::stablehlo::quantization::StaticRangePtqPreset& + static_range_ptq_preset, + const ::stablehlo::quantization::PipelineConfig& pipeline_config) const; private: absl::Nonnull ctx_; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc index 2f9882417420dd..f54f947990866b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.cc @@ -19,14 +19,11 @@ limitations under the License. 
#include "absl/base/nullability.h" #include "absl/log/die_if_null.h" #include "absl/status/statusor.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tsl/platform/errors.h" @@ -44,15 +41,9 @@ PreCalibrationComponent::PreCalibrationComponent( absl::StatusOr PreCalibrationComponent::Run( ModuleOp module_op, const QuantizationConfig& config) { TF_RETURN_IF_ERROR(RunPasses( - /*name=*/kName, /*add_passes_func=*/ - [this](PassManager& pm) { - pm.addPass(createLiftQuantizableSpotsAsFunctionsPass()); - pm.addNestedPass( - CreateInsertCustomAggregationOpsPass(calibration_options_)); - pm.addPass(CreateIssueIDsOfCustomAggregationOpsPass()); - // StableHLO Quantizer currently uses TF's calibration passes. Serialize - // the StableHLO module as tf.XlaCallModule to run calibration. 
- AddCallModuleSerializationPasses(pm); + kName, /*add_passes_func=*/ + [&config, this](PassManager& pm) { + AddPreCalibrationPasses(pm, calibration_options_, config.specs()); }, *ctx_, module_op)); return module_op; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc index 69f9679f90a357..37f83e57fb678d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration_test.cc @@ -68,7 +68,7 @@ class PreCalibrationComponentTest : public QuantizationTestBase {}; TEST_F(PreCalibrationComponentTest, HasCustomAggregatorOpAndQuantizableFuncForSimpleDotGeneral) { - PreCalibrationComponent component(&ctx_, CalibrationOptions()); + PreCalibrationComponent component(ctx_.get(), CalibrationOptions()); OwningOpRef module_op = ParseModuleOpString(R"mlir( module attributes {} { func.func @main(%arg0: tensor<1x4xf32>) -> tensor<1x3xf32> attributes {} { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc index dcde93f81d60e2..c756505f9c52ac 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.cc @@ -47,6 +47,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pre_calibration.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_export.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/saved_model_import.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/types.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" @@ -77,7 +78,6 @@ using ::tensorflow::quantization::CalibrationOptions; using ::tensorflow::quantization::ExportedModel; using ::tensorflow::quantization::PreprocessAndFreezeGraph; using ::tensorflow::quantization::PyFunctionLibrary; -using ::tensorflow::quantization::RepresentativeDatasetFile; using ::tensorflow::quantization::RunPasses; using ::tensorflow::quantization::UnfreezeConstantsAndSaveVariables; @@ -105,7 +105,8 @@ absl::StatusOr> RunExportPasses( } if (absl::Status pass_run_status = RunPasses( - /*name=*/export_opts.debug_name, + /*name=*/ + export_opts.debug_name, /*add_passes_func=*/ [dup_constants = export_opts.duplicate_shape_determining_constants]( PassManager& pm) { AddExportPasses(pm, dup_constants); }, @@ -229,9 +230,6 @@ StaticRangePtqComponent::StaticRangePtqComponent( const absl::string_view src_saved_model_path, std::vector signature_keys, std::unordered_set tags, - absl::flat_hash_map - representative_dataset_file_map, absl::flat_hash_map signature_def_map, absl::flat_hash_map function_aliases) : ctx_(ctx) { @@ -244,7 +242,7 @@ StaticRangePtqComponent::StaticRangePtqComponent( ctx_, py_function_library, src_saved_model_path, std::move(function_aliases), std::move(tags), std::move(signature_def_map), std::move(signature_keys), - std::move(representative_dataset_file_map), calibration_options); + calibration_options); sub_components_[2] = std::make_unique(ctx_); } @@ -267,9 +265,7 @@ 
absl::Status QuantizeStaticRangePtq( const std::vector& signature_keys, const absl::flat_hash_map& signature_def_map, const absl::flat_hash_map& function_aliases, - const PyFunctionLibrary& py_function_library, - const absl::flat_hash_map& - representative_dataset_file_map) { + const PyFunctionLibrary& py_function_library) { std::unordered_set tags; tags.insert(quantization_config.tf_saved_model().tags().begin(), quantization_config.tf_saved_model().tags().end()); @@ -283,8 +279,7 @@ absl::Status QuantizeStaticRangePtq( StaticRangePtqComponent static_range_ptq_component( ctx.get(), &py_function_library, src_saved_model_path, signature_keys, - tags, representative_dataset_file_map, signature_def_map, - function_aliases); + tags, signature_def_map, function_aliases); TF_ASSIGN_OR_RETURN(module_op, static_range_ptq_component.Run( module_op, quantization_config)); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h index 93697101e42178..4f2867b034cba3 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h @@ -57,9 +57,6 @@ class StaticRangePtqComponent : public Component { absl::string_view src_saved_model_path, std::vector signature_keys, std::unordered_set tags, - absl::flat_hash_map - representative_dataset_file_map, absl::flat_hash_map signature_def_map, absl::flat_hash_map function_aliases); @@ -101,10 +98,7 @@ absl::Status QuantizeStaticRangePtq( const absl::flat_hash_map& signature_def_map, const absl::flat_hash_map& function_aliases, - const tensorflow::quantization::PyFunctionLibrary& py_function_library, - const absl::flat_hash_map< - std::string, tensorflow::quantization::RepresentativeDatasetFile>& - representative_dataset_file_map); + const tensorflow::quantization::PyFunctionLibrary& py_function_library); // 
LINT.ThenChange(../python/pywrap_quantization.cc:static_range_ptq) } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD index 2138a04d2bb389..b7d75897cddd07 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/BUILD @@ -1,3 +1,4 @@ +load("//tensorflow:tensorflow.bzl", "tf_cc_test") load("//tensorflow:tensorflow.default.bzl", "get_compatible_with_portable") package( @@ -17,6 +18,7 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow", "@llvm-project//llvm:Support", @@ -26,3 +28,20 @@ cc_library( "@stablehlo//:stablehlo_ops", ], ) + +tf_cc_test( + name = "stablehlo_op_quant_spec_test", + srcs = ["stablehlo_op_quant_spec_test.cc"], + deps = [ + ":stablehlo_op_quant_spec", + "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", + "//tensorflow/compiler/mlir/quantization/common:test_base", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/core:test", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@stablehlo//:stablehlo_ops", + ], +) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc index 9e832650c24a23..bbcff2dcdbe6d2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -59,8 +60,8 @@ std::unique_ptr GetStableHloOpQuantSpec(Operation* op) { quant::GetUniformQuantizedTypeForBias}; } } - for (auto quantizable_operand : spec->coeff_op_quant_dim) { - spec->quantizable_operands.insert(quantizable_operand.first); + for (const auto [operand_idx, per_channel_dim] : spec->coeff_op_quant_dim) { + spec->quantizable_operands.insert(operand_idx); } } return spec; @@ -69,18 +70,17 @@ std::unique_ptr GetStableHloOpQuantSpec(Operation* op) { std::unique_ptr GetStableHloQuantScaleSpec(Operation* op) { auto scale_spec = std::make_unique(); if (llvm::isa(op)) { + mlir::stablehlo::ConcatenateOp, mlir::stablehlo::GatherOp, + mlir::stablehlo::PadOp, mlir::stablehlo::ReduceWindowOp, + mlir::stablehlo::ReshapeOp, mlir::stablehlo::SelectOp, + mlir::stablehlo::SliceOp, mlir::stablehlo::TransposeOp>(op)) { scale_spec->has_same_scale_requirement = true; } return scale_spec; } bool IsOpQuantizableStableHlo(Operation* op) { - if (mlir::isa(op)) { + if (isa(op)) { // Constant ops do not have QuantizableResult attribute but can be // quantized. 
return true; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc similarity index 56% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc rename to tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc index a469bd7c349c99..80a6d2fa451e6c 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/stablehlo_op_quant_spec_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec_test.cc @@ -15,33 +15,28 @@ limitations under the License. #include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" +#include #include #include "absl/strings/string_view.h" -#include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project -#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project -#include "mlir/IR/SymbolTable.h" // from @llvm-project -#include "mlir/Parser/Parser.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/quantization/common/test_base.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" namespace mlir::quant::stablehlo { namespace { -using ::mlir::quant::QuantizationTestBase; +using ::testing::NotNull; class IsOpQuantizableStableHloTest : public QuantizationTestBase {}; // Quantizable ops: constants // Non-quantizable ops: normal StableHLO ops and terminators -constexpr absl::string_view module_constant_add = 
R"mlir( +constexpr absl::string_view kModuleConstantAdd = R"mlir( module { func.func @constant_add() -> (tensor<3x2xf32>) { %cst1 = stablehlo.constant dense<2.4> : tensor<3x2xf32> @@ -55,7 +50,7 @@ constexpr absl::string_view module_constant_add = R"mlir( // Quantizable ops: XlaCallModule op with "fully_quantizable" attribute and // same-scale StableHLO ops // Non-quantizable ops: quantize/dequantize ops -constexpr absl::string_view module_composite_same_scale = R"mlir( +constexpr absl::string_view kModuleCompositeSameScale = R"mlir( module { func.func @same_scale_after_composite() -> tensor<3x1xf32> { %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> @@ -70,7 +65,7 @@ constexpr absl::string_view module_composite_same_scale = R"mlir( )mlir"; // Non-quantizable ops: XlaCallModule op without "fully_quantizable" attribute -constexpr absl::string_view module_composite_no_attr = R"mlir( +constexpr absl::string_view kModuleCompositeNoAttr = R"mlir( module { func.func @composite_without_attr() -> tensor<1x3xf32> { %0 = "tf.XlaCallModule"() {Sout = [#tf_type.shape<1x3>], _entry_function = @non_quantizable_composite, _original_entry_function = "non_quantizable_composite", _stablehlo_module_attrs = {}, device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : () -> tensor<1x3xf32> @@ -80,97 +75,79 @@ constexpr absl::string_view module_composite_no_attr = R"mlir( )mlir"; TEST_F(IsOpQuantizableStableHloTest, ConstantOpQuantizable) { - OwningOpRef module_op_ref = - ParseModuleOpString(module_constant_add); - func::FuncOp test_func = - 
GetFunctionFromModule(*module_op_ref, "constant_add"); - Operation* constant_op = - FindOperationOfType(test_func); - bool is_constant_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(constant_op); + OwningOpRef module_op_ref = ParseModuleOpString(kModuleConstantAdd); + auto test_func = module_op_ref->lookupSymbol("constant_add"); + ASSERT_THAT(test_func, NotNull()); - EXPECT_TRUE(is_constant_quantizable); + auto constant_op = + FindOperationOfType(test_func); + EXPECT_TRUE(IsOpQuantizableStableHlo(constant_op)); } TEST_F(IsOpQuantizableStableHloTest, TerminatorOpNotQuantizable) { - OwningOpRef module_op_ref = - ParseModuleOpString(module_constant_add); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "constant_add"); - Operation* return_op = FindOperationOfType(test_func); - bool is_return_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(return_op); - - EXPECT_FALSE(is_return_quantizable); + OwningOpRef module_op_ref = ParseModuleOpString(kModuleConstantAdd); + auto test_func = module_op_ref->lookupSymbol("constant_add"); + ASSERT_THAT(test_func, NotNull()); + + auto return_op = FindOperationOfType(test_func); + EXPECT_FALSE(IsOpQuantizableStableHlo(return_op)); } TEST_F(IsOpQuantizableStableHloTest, SameScaleOpQuantizable) { OwningOpRef module_op_ref = - ParseModuleOpString(module_composite_same_scale); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); - Operation* reshape_op = - FindOperationOfType(test_func); - bool is_reshape_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(reshape_op); - - EXPECT_TRUE(is_reshape_quantizable); + ParseModuleOpString(kModuleCompositeSameScale); + auto test_func = + module_op_ref->lookupSymbol("same_scale_after_composite"); + ASSERT_THAT(test_func, NotNull()); + + auto reshape_op = FindOperationOfType(test_func); + EXPECT_TRUE(IsOpQuantizableStableHlo(reshape_op)); } TEST_F(IsOpQuantizableStableHloTest, 
NonSameScaleOpNotQuantizable) { - OwningOpRef module_op_ref = - ParseModuleOpString(module_constant_add); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "constant_add"); - Operation* add_op = FindOperationOfType(test_func); - bool is_add_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(add_op); - - EXPECT_FALSE(is_add_quantizable); + OwningOpRef module_op_ref = ParseModuleOpString(kModuleConstantAdd); + auto test_func = module_op_ref->lookupSymbol("constant_add"); + ASSERT_THAT(test_func, NotNull()); + + auto add_op = FindOperationOfType(test_func); + EXPECT_FALSE(IsOpQuantizableStableHlo(add_op)); } TEST_F(IsOpQuantizableStableHloTest, ValidXlaCallModuleOpQuantizable) { OwningOpRef module_op_ref = - ParseModuleOpString(module_composite_same_scale); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); - Operation* xla_call_module_op = - FindOperationOfType(test_func); - bool is_xla_call_module_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(xla_call_module_op); - - EXPECT_TRUE(is_xla_call_module_quantizable); + ParseModuleOpString(kModuleCompositeSameScale); + auto test_func = + module_op_ref->lookupSymbol("same_scale_after_composite"); + ASSERT_THAT(test_func, NotNull()); + + auto xla_call_module_op = FindOperationOfType(test_func); + EXPECT_TRUE(IsOpQuantizableStableHlo(xla_call_module_op)); } TEST_F(IsOpQuantizableStableHloTest, InvalidXlaCallModuleOpNotQuantizable) { OwningOpRef module_op_ref = - ParseModuleOpString(module_composite_no_attr); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "composite_without_attr"); - Operation* xla_call_module_op = - FindOperationOfType(test_func); - bool is_xla_call_module_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(xla_call_module_op); - - EXPECT_FALSE(is_xla_call_module_quantizable); + ParseModuleOpString(kModuleCompositeNoAttr); + auto test_func = + 
module_op_ref->lookupSymbol("composite_without_attr"); + ASSERT_THAT(test_func, NotNull()); + + auto xla_call_module_op = FindOperationOfType(test_func); + EXPECT_FALSE(IsOpQuantizableStableHlo(xla_call_module_op)); } TEST_F(IsOpQuantizableStableHloTest, QuantizeDequantizeOpNotQuantizable) { OwningOpRef module_op_ref = - ParseModuleOpString(module_composite_same_scale); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "same_scale_after_composite"); - Operation* quantize_op = - FindOperationOfType(test_func); - Operation* dequantize_op = - FindOperationOfType(test_func); - bool is_quantize_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(quantize_op); - bool is_dequantize_quantizable = - mlir::quant::stablehlo::IsOpQuantizableStableHlo(dequantize_op); + ParseModuleOpString(kModuleCompositeSameScale); + auto test_func = + module_op_ref->lookupSymbol("same_scale_after_composite"); + ASSERT_THAT(test_func, NotNull()); + + auto quantize_op = FindOperationOfType(test_func); + EXPECT_FALSE(IsOpQuantizableStableHlo(quantize_op)); - EXPECT_FALSE(is_quantize_quantizable); - EXPECT_FALSE(is_dequantize_quantizable); + auto dequantize_op = + FindOperationOfType(test_func); + EXPECT_FALSE(IsOpQuantizableStableHlo(dequantize_op)); } } // namespace diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc index 633c193ab57c7c..f572a0795e3b77 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_mhlo_quant_to_int.cc @@ -89,7 +89,7 @@ UniformQuantizedPerAxisType GetPerChannelType(QuantType quant_type) { void GetQuantizationParams(OpBuilder &builder, Location loc, QuantType quant_type, Value &scales, Value &zero_points, bool output_zero_point_in_fp, - DenseIntElementsAttr &broadcast_dims) { 
+ DenseI64ArrayAttr &broadcast_dims) { // Get scales/zero points for per-tensor and per-axis quantization cases. if (auto *quant_per_tensor_type = std::get_if(&quant_type)) { @@ -140,8 +140,8 @@ void GetQuantizationParams(OpBuilder &builder, Location loc, builder.getI32Type()), zero_points_vec)); } - broadcast_dims = DenseIntElementsAttr::get( - RankedTensorType::get({1}, builder.getI64Type()), + broadcast_dims = DenseI64ArrayAttr::get( + builder.getContext(), {static_cast(quant_per_channel_type.getQuantizedDimension())}); } } @@ -256,9 +256,8 @@ Value ApplyMergedScalesAndZps(OpBuilder &builder, Location loc, merged_scale_double.end()), merged_zp_float(merged_zp_double.begin(), merged_zp_double.end()); - auto broadcast_dims = DenseIntElementsAttr::get( - RankedTensorType::get({1}, builder.getI64Type()), - {quantized_dimension}); + auto broadcast_dims = + DenseI64ArrayAttr::get(builder.getContext(), {quantized_dimension}); Value merged_scale = builder.create( loc, DenseFPElementsAttr::get( RankedTensorType::get({channel_size}, builder.getF32Type()), @@ -367,7 +366,7 @@ class ConvertUniformQuantizeOp ConversionPatternRewriter &rewriter, QuantType quant_type) const { Value scales, zero_points; - DenseIntElementsAttr broadcast_dims; + DenseI64ArrayAttr broadcast_dims; GetQuantizationParams(rewriter, op->getLoc(), quant_type, scales, zero_points, /*output_zero_point_in_fp=*/true, broadcast_dims); @@ -425,7 +424,7 @@ class ConvertUniformDequantizeOp return failure(); } Value scales, zero_points; - DenseIntElementsAttr broadcast_dims; + DenseI64ArrayAttr broadcast_dims; GetQuantizationParams(rewriter, op->getLoc(), *quant_type, scales, zero_points, /*output_zero_point_in_fp=*/false, broadcast_dims); @@ -465,15 +464,41 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { // We only handle cases where lhs, rhs and results all have quantized // element type. 
- if (failed(lhs_quant_type) || IsPerChannelType(*lhs_quant_type) || - failed(rhs_quant_type) || IsPerChannelType(*rhs_quant_type) || - failed(res_quant_type) || IsPerChannelType(*res_quant_type)) { + if (failed(lhs_quant_type) || failed(rhs_quant_type) || + failed(res_quant_type)) { op->emitError( - "AddOp requires the same quantized element type for all operands and " + "AddOp requires the quantized element type for all operands and " "results"); return failure(); } + if (IsPerChannelType(*lhs_quant_type) || + IsPerChannelType(*rhs_quant_type) || + IsPerChannelType(*res_quant_type)) { + // Handle Per-Channel Quantized Types. We only support lhs/rhs/result with + // exact same per-channel quantized types with I32 storage type. + if (!IsPerChannelType(*lhs_quant_type) || + !IsPerChannelType(*rhs_quant_type) || + !IsPerChannelType(*res_quant_type) || + GetPerChannelType(*lhs_quant_type) != + GetPerChannelType(*rhs_quant_type) || + GetPerChannelType(*lhs_quant_type) != + GetPerChannelType(*res_quant_type)) { + op->emitError( + "Per-channel quantized AddOp requires the same quantized element " + "type for all operands and results"); + return failure(); + } + if (!GetPerChannelType(*lhs_quant_type).getStorageType().isInteger(32)) { + // For server-side StableHLO Quantization, add is quantized only when + // fused with conv/dot ops, whose output must be i32. + op->emitError("Per-channel quantized AddOp requires i32 storage type"); + return failure(); + } + return matchAndRewritePerChannel(op, adaptor, rewriter, + GetPerChannelType(*lhs_quant_type)); + } + // TODO: b/260280919 - Consider avoiding conversion to int32. 
auto res_int32_tensor_type = op.getResult().getType().clone(rewriter.getI32Type()); @@ -536,6 +561,33 @@ class ConvertUniformQuantizedAddOp : public OpConversionPattern { return success(); } + + LogicalResult matchAndRewritePerChannel( + mhlo::AddOp op, mhlo::AddOpAdaptor adaptor, + ConversionPatternRewriter &rewriter, + UniformQuantizedPerAxisType quant_type) const { + // We assume lhs/rhs/result have the same quantized type with i32 storage. + Value add_result = rewriter.create( + op->getLoc(), adaptor.getLhs(), adaptor.getRhs()); + // Add zp contribution if it is non-zero for any channel. + if (llvm::any_of(quant_type.getZeroPoints(), + [](int64_t zp) { return zp != 0; })) { + SmallVector zps_vec(quant_type.getZeroPoints().begin(), + quant_type.getZeroPoints().end()); + Value zps = rewriter.create( + op->getLoc(), + DenseIntElementsAttr::get( + RankedTensorType::get({static_cast(zps_vec.size())}, + rewriter.getI32Type()), + zps_vec)); + add_result = rewriter.create( + op->getLoc(), add_result, zps, + rewriter.getDenseI64ArrayAttr( + {static_cast(quant_type.getQuantizedDimension())})); + } + rewriter.replaceOp(op, add_result); + return success(); + } }; // This is a convenient struct for holding dimension numbers for dot-like ops diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc index a24db896d79f1d..a6312e067af50f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/convert_tf_quant_ops_to_mhlo.cc @@ -596,7 +596,7 @@ class ConvertUniformQuantizedAddOp // rhs (bias) is always 1D that broadcasts to the last dim of lhs. 
auto broadcast_dims = - mhlo::GetI64ElementsAttr({lhs_type.getRank() - 1}, &rewriter); + rewriter.getDenseI64ArrayAttr({lhs_type.getRank() - 1}); auto rhs_type = GetUniformQuantizedType( op, op.getRhs().getType(), op.getRhsScales(), op.getRhsZeroPoints(), @@ -651,8 +651,7 @@ class ConvertUniformQuantizedClipByValueOp if (quantization_axis >= 0) { broadcast_dims_values.push_back(quantization_axis); } - auto broadcast_dims = - mhlo::GetI64ElementsAttr(broadcast_dims_values, &rewriter); + auto broadcast_dims = rewriter.getDenseI64ArrayAttr(broadcast_dims_values); auto min_max_type = GetUniformQuantizedType( op, op.getMin().getType(), op.getScales(), op.getZeroPoints(), diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc index 4c20b6bebdcdad..93946fdc320a97 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/bridge/legalize_tf_quant_test.cc @@ -23,8 +23,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.h" #include "xla/client/client_library.h" #include "xla/shape.h" -#include "xla/stream_executor/multi_platform_manager.h" #include "xla/stream_executor/platform.h" +#include "xla/stream_executor/platform_manager.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" @@ -45,7 +45,7 @@ class LegalizeTFQuantTest : public Test { tensorflow::ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED; mlir_to_hlo_args.mlir_module = mlir_module_string; tensorflow::se::Platform* platform = - tensorflow::se::MultiPlatformManager::PlatformWithName("Host").value(); + tensorflow::se::PlatformManager::PlatformWithName("Host").value(); auto client = xla::ClientLibrary::GetOrCreateCompileOnlyClient(platform).value(); tensorflow::tpu::TPUCompileMetadataProto metadata_proto; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc index 3e5b7e1f8d5ace..17300611e356ce 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_func_to_bfloat16.cc @@ -47,7 +47,29 @@ class BFloat16TypeConverter : public TypeConverter { } }; -// An Op is illegal iff it is non-UQ op and it contains qint types. +// This helper function makes legality check easier. 
Both convert ops in the +// patterns below are considered legal: +// - BitcastConvertOp(i32 -> f32) + ConvertOp(f32 -> bf16) +// - ConvertOp(bf16 -> f32) -> BitcastConvertOp(f32 -> i32) +template +bool IsConvertOpLegal(ConvertOp convert_op, BFloat16TypeConverter &converter) { + if (!converter.isLegal(convert_op.getOperand().getType())) { + auto other_convert_op = dyn_cast_or_null( + convert_op.getOperand().getDefiningOp()); + return other_convert_op && + converter.isLegal(other_convert_op.getOperand().getType()); + } else if (!converter.isLegal(convert_op.getResult().getType())) { + if (!convert_op.getResult().hasOneUse()) { + return false; + } + auto other_convert_op = dyn_cast_or_null( + *convert_op.getResult().getUsers().begin()); + return other_convert_op && + converter.isLegal(other_convert_op.getResult().getType()); + } + return true; +} + class BFloat16TypeConversionTarget : public ConversionTarget { public: explicit BFloat16TypeConversionTarget(MLIRContext &ctx, @@ -58,6 +80,15 @@ class BFloat16TypeConversionTarget : public ConversionTarget { // types do not contain. 
if (auto func = dyn_cast(op)) { if (!converter_.isSignatureLegal(func.getFunctionType())) return false; + } else if (auto bitcast_convert_op = + dyn_cast(op)) { + return IsConvertOpLegal(bitcast_convert_op, + converter_); + } else if (auto convert_op = dyn_cast(op)) { + return IsConvertOpLegal(convert_op, + converter_); } return converter_.isLegal(op); }); @@ -69,7 +100,7 @@ class BFloat16TypeConversionTarget : public ConversionTarget { class BFloat16TypePattern : public ConversionPattern { public: - BFloat16TypePattern(MLIRContext *ctx, TypeConverter &converter) + BFloat16TypePattern(TypeConverter &converter, MLIRContext *ctx) : ConversionPattern(converter, MatchAnyOpTypeTag(), /*benefit=*/1, ctx) {} LogicalResult matchAndRewrite( @@ -78,6 +109,10 @@ class BFloat16TypePattern : public ConversionPattern { if (getTypeConverter()->isLegal(op)) { return failure(); } + if (isa(op)) { + // Skip BitcastConvertOp, which is handled by the other pattern. + return failure(); + } // Update the results. SmallVector new_results; @@ -118,6 +153,42 @@ class BFloat16TypePattern : public ConversionPattern { return success(); } }; + +class BitcastConvertOpPattern + : public OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + LogicalResult matchAndRewrite( + mlir::stablehlo::BitcastConvertOp op, + mlir::stablehlo::BitcastConvertOpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + bool is_input_legal = + getTypeConverter()->isLegal(op.getOperand().getType()); + bool is_output_legal = + getTypeConverter()->isLegal(op.getResult().getType()); + if (is_input_legal && is_output_legal) { + return failure(); + } else if (is_input_legal) { + // output is f32, we bitcast_convert to f32 and then convert to bf16. 
+ Value output = rewriter.create( + op->getLoc(), op.getResult().getType(), adaptor.getOperand()); + rewriter.replaceOpWithNewOp( + op, getTypeConverter()->convertType(op.getResult().getType()), + output); + } else if (is_output_legal) { + // input is f32, we convert from bf16 and then bitcast_convert. + Value output = rewriter.create( + op->getLoc(), op.getOperand().getType(), adaptor.getOperand()); + rewriter.replaceOpWithNewOp( + op, op.getResult().getType(), output); + } else { + // Both input/output are f32. Convert to no-op. + rewriter.replaceOp(op, adaptor.getOperand()); + } + return success(); + } +}; } // namespace #define GEN_PASS_DEF_CONVERTFUNCTOBFLOAT16PASS @@ -140,7 +211,8 @@ void ConvertFuncToBfloat16Pass::runOnOperation() { RewritePatternSet patterns(context); BFloat16TypeConverter converter; - patterns.add(context, converter); + patterns.add(converter, + context); populateFunctionOpInterfaceTypeConversionPattern(patterns, converter); BFloat16TypeConversionTarget target(*context, converter); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc index a5f35c9a64b5bd..5f69cd7ee3e082 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/convert_xla_call_module_op_to_bfloat16.cc @@ -15,6 +15,8 @@ limitations under the License. #include +#include "absl/status/status.h" +#include "absl/status/statusor.h" #include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep @@ -43,22 +45,26 @@ limitations under the License. 
namespace mlir::quant::stablehlo { -FailureOr ConvertSerializedStableHloModuleToBfloat16( - MLIRContext* context, StringRef serialized_stablehlo_module) { +absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( + StringRef serialized_stablehlo_module) { // StableHLO module is empty often because the XlaCallModuleOp is already // deserialized, e.g. after invoking XlaCallModuleDeserializationPass. We // don't handle this situation. - if (serialized_stablehlo_module.empty()) return failure(); + if (serialized_stablehlo_module.empty()) { + return absl::InvalidArgumentError("StableHLO module is empty."); + } + MLIRContext context; OwningOpRef stablehlo_module_op = mlir::stablehlo::deserializePortableArtifact(serialized_stablehlo_module, - context); + &context); // Convert the StableHLO module to bfloat16. - PassManager pm(context); + PassManager pm(&context); pm.addNestedPass(createConvertFuncToBfloat16Pass()); if (failed(pm.run(stablehlo_module_op.get()))) { - return failure(); + return absl::InternalError( + "Failed to convert StableHLO module to bfloat16."); } std::string bytecode; @@ -66,7 +72,7 @@ FailureOr ConvertSerializedStableHloModuleToBfloat16( if (failed(mlir::stablehlo::serializePortableArtifact( stablehlo_module_op.get(), mlir::stablehlo::getCurrentVersion(), os))) { - return failure(); + return absl::InternalError("Failed to serialize StableHLO module."); } return bytecode; } @@ -95,9 +101,11 @@ void ConvertXlaCallModuleOpToBfloat16Pass::runOnOperation() { auto result = func_op->walk([&](TF::XlaCallModuleOp op) { // Converts the serialized StableHLO module to bfloat16. 
- auto result = ConvertSerializedStableHloModuleToBfloat16( - &getContext(), op.getModuleAttr()); - if (failed(result)) { + auto result = + ConvertSerializedStableHloModuleToBfloat16(op.getModuleAttr()); + if (!result.ok()) { + llvm::errs() << "Failed to convert StableHLO module to bfloat16: " + << result.status().message(); return WalkResult::interrupt(); } op.setModuleAttr(StringAttr::get(&getContext(), *result)); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc index 6f13634b317aa4..dbe88208ae7b03 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions.cc @@ -12,9 +12,13 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include +#include #include #include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project @@ -31,6 +35,11 @@ limitations under the License. 
#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tsl/platform/regexp.h" // IWYU pragma: keep + +#define DEBUG_TYPE "lift_quantizable_spots_as_functions" namespace mlir::quant::stablehlo { @@ -39,13 +48,16 @@ namespace mlir::quant::stablehlo { namespace { +using ::stablehlo::quantization::FunctionNameMatcherSpec; +using ::stablehlo::quantization::Method; +using ::stablehlo::quantization::QuantizationSpec; +using ::stablehlo::quantization::QuantizationSpecs; + // TODO - b/303543789: Move the helper functions below to a separate util. // Fetches the default or null attribute, used for pattern matching. Attribute DefaultOrNullAttr(OpBuilder& builder, const Attribute& attr) { - if (!attr) { - return builder.getStringAttr(kNullAttributeValue); - } - return attr; + if (attr) return attr; + return builder.getStringAttr(kNullAttributeValue); } // Checks whether the value of a constant equals the given float, regardless @@ -62,6 +74,12 @@ bool FloatValueEquals(const Attribute& attr, const double value) { }); } +// Lifts quantizable units as separate functions, thereby identifying the +// boundaries of quantizable subgraphs. `QuantizationSpecs` influences how +// quantizable units are lifted. +// +// FileCheck test cases using various `QuantizationSpecs` can be seen at +// `TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass`. 
class LiftQuantizableSpotsAsFunctionsPass : public impl::LiftQuantizableSpotsAsFunctionsPassBase< LiftQuantizableSpotsAsFunctionsPass> { @@ -69,10 +87,19 @@ class LiftQuantizableSpotsAsFunctionsPass MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( LiftQuantizableSpotsAsFunctionsPass) - explicit LiftQuantizableSpotsAsFunctionsPass() = default; + LiftQuantizableSpotsAsFunctionsPass() = default; + + // Constructor with explicit user-provided `QuantizationSpecs`. + explicit LiftQuantizableSpotsAsFunctionsPass( + QuantizationSpecs quantization_specs) + : quantization_specs_(std::move(quantization_specs)) {} private: void runOnOperation() override; + + // No explicit quantization spec is specified by default. Implicitly this + // means that all quantizable units will be identified and lifted. + QuantizationSpecs quantization_specs_{}; }; namespace simple_patterns { @@ -83,6 +110,91 @@ namespace fusion_patterns { #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.inc" } +// Returns a `func::FuncOp` in `module_op` (not nested) whose name matches +// `name`. Returns null if no such a function exists. +// TODO: b/307620778 - Factor out "FindMainFuncOp" functionality. +func::FuncOp FindFuncOp(ModuleOp module_op, const StringRef name) { + auto func_ops = module_op.getOps(); + auto func_itr = llvm::find_if(func_ops, [name](func::FuncOp func_op) { + return func_op.getName() == name; + }); + + if (func_itr == func_ops.end()) return {}; + return *func_itr; +} + +// Quantizable Unit matcher that uses lifted function's name for matching. +class FunctionNameMatcher { + public: + explicit FunctionNameMatcher(const FunctionNameMatcherSpec& spec) + : match_regex_(GetMatchRegex(spec)) {} + + // Returns `true` when matched with the entry function of + // `xla_call_module_op`. 
+ bool Match(TF::XlaCallModuleOp xla_call_module_op) const { + if (match_regex_ == nullptr) return false; + + const std::string lifted_func_name = + xla_call_module_op->getAttrOfType("_entry_function") + .getValue() + .str(); + + return RE2::FullMatch(lifted_func_name, *match_regex_); // NOLINT + } + + private: + // Returns an owned `RE2` object that corresponds to the `spec`. Returns + // `nullptr` if the `spec` is invalid. + // NOLINTNEXTLINE - RE2 included via TSL regexp.h + std::unique_ptr GetMatchRegex(const FunctionNameMatcherSpec& spec) { + const std::string& regex = spec.regex(); + if (regex.empty()) return nullptr; + + return std::make_unique(regex); // NOLINT + } + + // Regex object used for matching against a lifted function's name. + std::unique_ptr match_regex_; // NOLINT +}; + +// Applies quantization spec to all matched lifted functions. At this point only +// denylisting (`NoQuantization`) will be applied if specs is nonempty. +// TODO: b/307620778 - Support more advanced selective quantization methods. +LogicalResult ApplyQuantizationSpec(const QuantizationSpec& spec, + ModuleOp module_op) { + func::FuncOp main_func = FindFuncOp(module_op, "main"); + if (!main_func) return failure(); + + const Method& quantization_method = spec.method(); + if (!quantization_method.has_no_quantization()) { + module_op->emitError() << "Unsupported quantization method: " + << quantization_method.DebugString() << "\n"; + return failure(); + } + + const FunctionNameMatcher matcher(spec.matcher().function_name()); + for (auto xla_call_module_op : main_func.getOps()) { + if (!matcher.Match(xla_call_module_op)) continue; + + // Disable quantization when matched. + const std::string lifted_func_name = + xla_call_module_op->getAttrOfType("_entry_function") + .getValue() + .str(); + func::FuncOp lifted_func = FindFuncOp(module_op, lifted_func_name); + + // Remove relevant attributes that enable quantization. 
This essentially + // disables quantization for the matched `xla_call_module_op`. + xla_call_module_op->removeAttr("_original_entry_function"); + xla_call_module_op->removeAttr("_tfl_quant_trait"); + lifted_func->removeAttr("tf_quant.composite_function"); + + LLVM_DEBUG(llvm::dbgs() << "Disabled quantization for quantizable unit: " + << lifted_func_name << "\n"); + } + return success(); +} + void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() { MLIRContext* ctx = &getContext(); RewritePatternSet patterns(ctx); @@ -101,8 +213,26 @@ void LiftQuantizableSpotsAsFunctionsPass::runOnOperation() { // Remove all attr_map attributes. module_op.walk([](Operation* op) { op->removeAttr(kAttrMapAttribute); }); + + // Perform selective quantization. Iterates over the quantization specs and + // applies quantization methods to each matched lifted function. + for (const QuantizationSpec& spec : quantization_specs_.specs()) { + if (failed(ApplyQuantizationSpec(spec, module_op))) { + signalPassFailure(); + return; + } + } } } // namespace +// Creates `LiftQuantizableSpotsAsFunctionsPass` with user-defined +// `QuantizationSpecs`. 
+std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const QuantizationSpecs& quantization_specs) { + return std::make_unique( + quantization_specs); +} + } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td index 6e134a60e3816e..a0bc228397465b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/lift_quantizable_spots_as_functions_fusion.td @@ -27,12 +27,25 @@ include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.td" // Pattern rules for lifting ops with bias as functions //===----------------------------------------------------------------------===// +def LiftDotGeneralWithBiasSameShape : Pat< + (StableHLO_AddOp:$res + (StableHLO_DotGeneralOp + $lhs, $rhs, $dot_dimension_numbers, $precision_config), + $bias), + (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_same_shape_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias)], [], (addBenefit 5)>; + def LiftConvWithBias : Pat< (StableHLO_AddOp:$res (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, $feature_group_count, $batch_group_count, $precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (LiftAsTFXlaCallModule<"composite_conv_with_bias_fn"> (ArgumentList $lhs, $rhs, $bias), (ResultList $res), @@ -52,7 +65,7 @@ def LiftDotGeneralWithBias : Pat< (StableHLO_AddOp:$res (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, 
$precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_fn"> (ArgumentList $lhs, $rhs, $bias), (ResultList $res), @@ -216,13 +229,29 @@ def LiftDotGeneralWithRelu6 : Pat< // Pattern rules for lifting ops with bias and activation as functions //===----------------------------------------------------------------------===// +def LiftDotGeneralWithBiasSameShapeAndRelu : Pat< + (StableHLO_MaxOp:$res + (StableHLO_AddOp + (StableHLO_DotGeneralOp + $lhs, $rhs, $dot_dimension_numbers, $precision_config), + $bias), + (StableHLO_ConstantOp $cst)), + (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_same_shape_and_relu_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), + (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], (addBenefit 10)>; + def LiftConvWithBiasAndRelu : Pat< (StableHLO_MaxOp:$res (StableHLO_AddOp (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, $feature_group_count, $batch_group_count, $precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (StableHLO_ConstantOp $cst)), (LiftAsTFXlaCallModule<"composite_conv_with_bias_and_relu_fn"> (ArgumentList $lhs, $rhs, $bias), @@ -245,7 +274,7 @@ def LiftDotGeneralWithBiasAndRelu : Pat< (StableHLO_AddOp (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, $precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (StableHLO_ConstantOp $cst)), (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_and_relu_fn"> (ArgumentList $lhs, $rhs, $bias), @@ -303,6 +332,21 @@ def LiftDotGeneralWithBiasAndReluDynamic : Pat< [(IsNotInLiftedFunc $res), (FloatValueEquals<"0"> $cst), (IsStableHLOConstantOp $bias)], [], 
(addBenefit 15)>; +def LiftDotGeneralWithBiasSameShapeAndRelu6 : Pat< + (StableHLO_ClampOp:$res + (StableHLO_ConstantOp $cst_0), + (StableHLO_AddOp + (StableHLO_DotGeneralOp + $lhs, $rhs, $dot_dimension_numbers, $precision_config), + $bias), + (StableHLO_ConstantOp $cst_1)), + (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_same_shape_and_relu6_fn"> + (ArgumentList $lhs, $rhs, $bias), + (ResultList $res), + (NamedAttributeList + (NamedAttr<"dot_dimension_numbers"> $dot_dimension_numbers), + (NamedAttr<"precision_config"> (DefaultOrNullAttr $precision_config)))), + [(IsNotInLiftedFunc $res), (IsStableHLOConstantOp $bias), (FloatValueEquals<"0"> $cst_0), (FloatValueEquals<"6"> $cst_1)], [], (addBenefit 10)>; def LiftConvWithBiasAndRelu6 : Pat< (StableHLO_ClampOp:$res @@ -311,7 +355,7 @@ def LiftConvWithBiasAndRelu6 : Pat< (StableHLO_ConvolutionOp $lhs, $rhs, $window_strides, $padding, $lhs_dilation, $rhs_dilation, $window_reversal, $dimension_numbers, $feature_group_count, $batch_group_count, $precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (StableHLO_ConstantOp $cst_1)), (LiftAsTFXlaCallModule<"composite_conv_with_bias_and_relu6_fn"> (ArgumentList $lhs, $rhs, $bias), @@ -334,7 +378,7 @@ def LiftDotGeneralWithBiasAndRelu6 : Pat< (StableHLO_AddOp (StableHLO_DotGeneralOp $lhs, $rhs, $dot_dimension_numbers, $precision_config), - $bias), + (StableHLO_BroadcastInDimOp $bias, $dims)), (StableHLO_ConstantOp $cst_1)), (LiftAsTFXlaCallModule<"composite_dot_general_with_bias_and_relu6_fn"> (ArgumentList $lhs, $rhs, $bias), diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h index d89b044b189ec0..5bb3e58be01d58 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h @@ -19,13 +19,13 @@ limitations under the License. 
#include #include +#include "absl/status/statusor.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_options.pb.h" namespace mlir::quant::stablehlo { @@ -33,22 +33,22 @@ namespace mlir::quant::stablehlo { // Creates a `QuantizePass` that quantizes ops according to surrounding qcast / // dcast ops. std::unique_ptr> CreateQuantizePass( - const quant::QuantizationSpecs& quantization_specs); + const quant::QuantizationSpecs& quantization_specs, + bool enable_per_channel_quantized_weight = true); // Creates a pass that quantizes weight component of StableHLO graph. std::unique_ptr> CreateQuantizeWeightPass( const ::stablehlo::quantization::QuantizationComponentSpec& quantization_component_spec = {}); -// Creates an instance of the StableHLO dialect PrepareQuantize pass without any -// arguments. Preset method of SRQ is set to the quantization option by default. -std::unique_ptr> CreatePrepareQuantizePass( - bool enable_per_channel_quantization = false, int bit_width = 8); - // Converts a serialized StableHLO module to bfloat16 and output serialized // module. 
-FailureOr ConvertSerializedStableHloModuleToBfloat16( - MLIRContext* context, StringRef serialized_stablehlo_module); +absl::StatusOr ConvertSerializedStableHloModuleToBfloat16( + StringRef serialized_stablehlo_module); + +std::unique_ptr> +CreateLiftQuantizableSpotsAsFunctionsPass( + const ::stablehlo::quantization::QuantizationSpecs& quantization_specs); // Adds generated pass default constructors or options definitions. #define GEN_PASS_DECL diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td index 3e1c431b6e6fd5..dcebf70eb5c1e9 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.td @@ -21,25 +21,6 @@ def QuantizeWeightPass : Pass<"stablehlo-quantize-weight", "mlir::func::FuncOp"> let constructor = "mlir::quant::stablehlo::CreateQuantizeWeightPass()"; } -def PrepareQuantizePass : Pass<"stablehlo-prepare-quantize", "mlir::func::FuncOp"> { - let summary = "Prepare StableHLO dialect for static range quantization."; - let options = [ - Option<"enable_per_channel_quantization_", - "enable-per-channel-quantization", - "bool", /*default=*/"true", - "Whether enable per-channel quantized weights.">, - Option<"bit_width_", "bit-width", "int", /*default=*/"8", - "Bitwidth of quantized integer"> - ]; - let constructor = "mlir::quant::stablehlo::CreatePrepareQuantizePass()"; - let dependentDialects = [ - "mlir::stablehlo::StablehloDialect", - "mlir::quant::QuantizationDialect", - "mlir::quantfork::QuantizationForkDialect", - "mlir::arith::ArithDialect", - ]; -} - def UnfuseMhloBatchNormPass : Pass<"stablehlo-unfuse-mhlo-batch-norm", "mlir::func::FuncOp"> { let summary = "Unfuses batch normalization into arithmetic ops."; } @@ -53,6 +34,7 @@ def LiftQuantizableSpotsAsFunctionsPass : Pass<"stablehlo-lift-quantizable-spots that disperse values. 
(ex: convolution, dot_general) }]; let dependentDialects = [ + "mlir::func::FuncDialect", "mlir::stablehlo::StablehloDialect", "TF::TensorFlowDialect", ]; @@ -67,40 +49,62 @@ def ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass : Pass<"stablehlo- }]; } -def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> { - let summary = "Applies static-range quantization on ops."; +def RestoreFunctionNamePass : Pass<"stablehlo-restore-function-name", "ModuleOp"> { + let summary = "Restores function name from XlaCallModule op."; +} + +def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-functions", "ModuleOp"> { + let summary = "Quantize composite functions with QDQ input / outputs."; + let options = [ + Option<"enable_per_channel_quantized_weight_", + "enable-per-channel-quantized-weight", + "bool", /*default=*/"true", + "Whether to enable per-channel quantized weights.">, + Option<"mlir_dump_file_name_", "mlir-dump-file-name", + "std::optional", /*default=*/"std::nullopt", + "MLIR dump file name."> + ]; let dependentDialects = [ + "mlir::arith::ArithDialect", "mlir::stablehlo::StablehloDialect", "mlir::quant::QuantizationDialect", "mlir::quantfork::QuantizationForkDialect", + "TF::TensorFlowDialect", ]; } -def RestoreFunctionNamePass : Pass<"stablehlo-restore-function-name", "ModuleOp"> { - let summary = "Restores function name from XlaCallModule op."; +def PrepareQuantizePass : Pass<"stablehlo-prepare-quantize", "mlir::func::FuncOp"> { + let summary = "Prepare StableHLO dialect for static range quantization by converting quantfork.stats into quantfork.qcast and dcast ops."; + let options = [ + Option<"enable_per_channel_quantized_weight_", + "enable-per-channel-quantized-weight", + "bool", /*default=*/"true", + "Whether to enable per-channel quantized weights.">, + Option<"bit_width_", "bit-width", "int", /*default=*/"8", + "Bitwidth of quantized integer"> + ]; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", + 
"mlir::quant::QuantizationDialect", + "mlir::quantfork::QuantizationForkDialect", + "mlir::arith::ArithDialect", + ]; } -def PostQuantizePass : Pass<"stablehlo-post-quantize", "mlir::func::FuncOp"> { - let summary = "Apply clean-up after quantization."; +def QuantizePass : Pass<"stablehlo-quantize", "mlir::ModuleOp"> { + let summary = "Applies static-range quantization on ops by converting quantfork.qcast, quantfork.dcast, and float op into uniform quantized ops ."; let dependentDialects = [ "mlir::stablehlo::StablehloDialect", + "mlir::quant::QuantizationDialect", "mlir::quantfork::QuantizationForkDialect", ]; } -def QuantizeCompositeFunctionsPass : Pass<"stablehlo-quantize-composite-functions", "ModuleOp"> { - let summary = "Quantize composite functions with QDQ input / outputs."; - let options = [ - Option<"mlir_dump_file_name_", "mlir-dump-file-name", - "std::optional", /*default=*/"std::nullopt", - "MLIR dump file name."> - ]; +def PostQuantizePass : Pass<"stablehlo-post-quantize", "mlir::func::FuncOp"> { + let summary = "Apply clean-up after quantization."; let dependentDialects = [ - "mlir::arith::ArithDialect", "mlir::stablehlo::StablehloDialect", - "mlir::quant::QuantizationDialect", "mlir::quantfork::QuantizationForkDialect", - "TF::TensorFlowDialect", ]; } @@ -124,11 +128,6 @@ def ConvertXlaCallModuleOpToBfloat16Pass : Pass<"stablehlo-convert-xla-call-modu ]; } -def PopulateShapePass : Pass<"populate-shape", "ModuleOp"> { - let summary = "Populate output shape with known information for CustomAggregatorOp and XlaCallModuleOp."; - let dependentDialects = ["TF::TensorFlowDialect"]; -} - def OptimizeGraphPass : Pass<"optimize-graph", "ModuleOp"> { let summary = "Optimize the sub-optimal patterns after quantization."; let dependentDialects = ["mlir::stablehlo::StablehloDialect",]; diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc deleted file mode 
100644 index 0d4f0594f5c7d8..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/populate_shape.cc +++ /dev/null @@ -1,144 +0,0 @@ -/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -==============================================================================*/ - -#include - -#include "llvm/Support/Casting.h" -#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project -#include "mlir/IR/BuiltinTypes.h" // from @llvm-project -#include "mlir/IR/PatternMatch.h" // from @llvm-project -#include "mlir/IR/TypeRange.h" // from @llvm-project -#include "mlir/IR/TypeUtilities.h" // from @llvm-project -#include "mlir/IR/Types.h" // from @llvm-project -#include "mlir/Support/LogicalResult.h" // from @llvm-project -#include "mlir/Support/TypeID.h" // from @llvm-project -#include "mlir/Transforms/DialectConversion.h" // from @llvm-project -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" // IWYU pragma: keep -#include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/dynamic_shape_utils.h" -#include "tensorflow/core/ir/types/dialect.h" - -namespace mlir::quant::stablehlo { - -#define GEN_PASS_DEF_POPULATESHAPEPASS -#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" - 
-namespace { - -class PopulateShapeForCustomAggregatorOp - : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - TF::CustomAggregatorOp op, TF::CustomAggregatorOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - auto input_shape_type = op.getInput().getType().dyn_cast(); - auto output_shape_type = op.getOutput().getType(); - - if (!input_shape_type.isa()) { - input_shape_type = adaptor.getInput().getType(); - } - - if (input_shape_type.isa() && - !output_shape_type.isa() && - TF::HasCompatibleElementTypes(input_shape_type, output_shape_type)) { - auto new_op = rewriter.create( - op->getLoc(), /*output=*/input_shape_type, - /*args=*/adaptor.getInput(), - /*Id=*/op.getId()); - new_op->setAttrs(op->getAttrs()); - rewriter.replaceOp(op, new_op); - return success(); - } - return failure(); - } -}; - -class PopulateShapeForXlaCallModuleOp - : public OpConversionPattern { - public: - using OpConversionPattern::OpConversionPattern; - - LogicalResult matchAndRewrite( - TF::XlaCallModuleOp op, TF::XlaCallModuleOpAdaptor adaptor, - ConversionPatternRewriter &rewriter) const override { - if (op->getNumResults() != 1) { - op->emitError("XlaCallModuleOp doesn't have 1 output."); - return failure(); - } - // Assume XlaCallModuleOp only has 1 output. 
- auto output_shape_type = op->getResultTypes()[0]; - if (!output_shape_type.isa()) { - auto output_shape_attr = op.getSout()[0].dyn_cast(); - if (!output_shape_attr.hasRank()) { - return failure(); - } - auto new_output_shape_type = tensorflow::GetTypeFromTFTensorShape( - output_shape_attr.getShape(), - getElementTypeOrSelf(op.getResultTypes()[0])); - auto new_op = rewriter.create( - op->getLoc(), /*output=*/new_output_shape_type, - /*args=*/adaptor.getOperands(), - /*version=*/op.getVersionAttr(), - /*module=*/op.getModuleAttr(), - /*Sout=*/op.getSoutAttr()); - new_op->setAttrs(op->getAttrs()); - rewriter.replaceOp(op, new_op); - return success(); - } - return failure(); - } -}; - -class PopulateShapePass - : public impl::PopulateShapePassBase { - public: - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PopulateShapePass) - - explicit PopulateShapePass() = default; - - private: - void runOnOperation() override; -}; - -void PopulateShapePass::runOnOperation() { - Operation *op = getOperation(); - MLIRContext *context = op->getContext(); - RewritePatternSet patterns(context); - ConversionTarget target(*context); - target.addDynamicallyLegalOp([](Operation *op) { - auto custom_aggregator_op = llvm::dyn_cast(op); - return custom_aggregator_op.getInput().getType().isa() && - custom_aggregator_op.getOutput().getType().isa(); - }); - target.addDynamicallyLegalOp([](Operation *op) { - if (op->getNumResults() != 1) return true; - return op->getResultTypes()[0].isa(); - }); - - patterns - .add( - context); - - if (failed(applyPartialConversion(op, target, std::move(patterns)))) { - return signalPassFailure(); - } -} -} // namespace - -} // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc index 1291b0f7aa83eb..688e21b7d898dc 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc +++ 
b/tensorflow/compiler/mlir/quantization/stablehlo/passes/prepare_quantize.cc @@ -29,6 +29,7 @@ limitations under the License. #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" @@ -40,11 +41,11 @@ namespace mlir { namespace quant { namespace stablehlo { -namespace { - #define GEN_PASS_DEF_PREPAREQUANTIZEPASS #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h.inc" +namespace { + // Applies prepare quantization on the model in TF dialect. This pass runs // before the quantization pass and propagate the quantization parameters // across ops. This step is necessary for post-training quantization and also @@ -53,12 +54,14 @@ namespace { class PrepareQuantizePass : public impl::PrepareQuantizePassBase { public: - PrepareQuantizePass() = default; - PrepareQuantizePass(const PrepareQuantizePass&) = default; + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(PrepareQuantizePass) + + using impl::PrepareQuantizePassBase< + PrepareQuantizePass>::PrepareQuantizePassBase; - explicit PrepareQuantizePass(bool enable_per_channel_quantization, + explicit PrepareQuantizePass(bool enable_per_channel_quantized_weight, int bit_width) { - enable_per_channel_quantization_ = enable_per_channel_quantization; + enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; bit_width_ = bit_width; } @@ -162,9 +165,11 @@ void PrepareQuantizePass::runOnOperation() { // Finally, the quantization parameters can be propagated to the rest of the // values (tensors). 
ApplyQuantizationParamsPropagation( - func, /*is_signed=*/true, bit_width_, !enable_per_channel_quantization_, - GetStableHloOpQuantSpec, GetStableHloQuantScaleSpec, - /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false); + func, /*is_signed=*/true, bit_width_, + !enable_per_channel_quantized_weight_, GetStableHloOpQuantSpec, + GetStableHloQuantScaleSpec, + /*infer_tensor_ranges=*/true, /*legacy_float_scale=*/false, + /*is_qdq_conversion=*/false); // Restore constants as stablehlo::ConstantOp. RewritePatternSet patterns_2(ctx); @@ -180,9 +185,9 @@ void PrepareQuantizePass::runOnOperation() { // Creates an instance of the TensorFlow dialect PrepareQuantize pass. std::unique_ptr> CreatePrepareQuantizePass( - bool enable_per_channel_quantization, int bit_width) { - return std::make_unique(enable_per_channel_quantization, - bit_width); + bool enable_per_channel_quantized_weight, int bit_width) { + return std::make_unique( + enable_per_channel_quantized_weight, bit_width); } } // namespace stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc index 91129477a726af..76430bec75e4ce 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.cc @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include "absl/algorithm/container.h" #include "llvm/ADT/STLExtras.h" @@ -48,8 +49,8 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" +#include "tensorflow/compiler/mlir/quantization/common/uniform_quantized_types.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/ops/stablehlo_op_quant_spec.h" -#include "tensorflow/compiler/mlir/quantization/stablehlo/uniform_quantized_types.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -59,12 +60,15 @@ namespace mlir::quant::stablehlo { namespace { +using ::mlir::quant::FindUserOfType; using ::mlir::quant::TryCast; using ::mlir::stablehlo::AddOp; +using ::mlir::stablehlo::BroadcastInDimOp; using ::mlir::stablehlo::ConcatenateOp; using ::mlir::stablehlo::ConvolutionOp; using ::mlir::stablehlo::DotGeneralOp; using ::mlir::stablehlo::DynamicBroadcastInDimOp; +using ::mlir::stablehlo::GatherOp; using ::mlir::stablehlo::GetDimensionSizeOp; using ::mlir::stablehlo::ReshapeOp; using ::mlir::stablehlo::UniformQuantizeOp; @@ -79,14 +83,17 @@ bool IsQuantizedTensorType(const Type type) { type.cast().getElementType().isa(); } -// Returns dynamically broadcasted user op of an input op. Returns null if -// the op is not dynamically broadcasted or not the intended type. -// Dynamic shapes usually has the following pattern. In the example below, -// the input operand would be stablehlo.convolution op, and return value would -// be stablehlo.add op. +// Returns broadcasted user op of an input op. Returns null if +// the op is not broadcasted or not the intended type. +// Supports both static broadcast and dynamic broadcast. // Note that the patterns below differ from lifted patterns as // ShapeLegalizeToHloPass is ran prior to running this pass. 
// +// Dynamically broadcasted bias due to unknown input batch size +// usually has the following pattern. In the example below, +// the input operand would be stablehlo.convolution op, and return value would +// be stablehlo.add op. +// // ``` // %0 = stablehlo.constant dense<3> // %1 = stablehlo.constant dense<4> @@ -100,42 +107,44 @@ bool IsQuantizedTensorType(const Type type) { // %6 = stablehlo.concatenate %5, %0, %1, %2, dim = 0 : // (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) // -> tensor<4xi32> -// %7 = stablehlo.dynamic_broadcast_in_dims %arg2, %6 +// %7 = stablehlo.dynamic_broadcast_in_dim %arg2, %6 // %8 = stablehlo.add %3, %7 // ``` +// +// Statically broadcasted bias will be broadcasted to match the accumulation. +// ``` +// %3 = stablehlo.convolution(%%arg0, %%arg1) : +// (tensor, tensor<2x3x3x2xf32>) -> tensor +// %4 = stablehlo.broadcast_in_dim %arg2, %3 +// %5 = stablehlo.add %3, %4 +// ``` template -Operation* GetDynamicallyBroadcastedUserOp(Operation* op) { - FailureOr get_dimension_size_op = - TryCast(op->getNextNode(), - /*name=*/"get_dimension_size_op"); - if (failed(get_dimension_size_op)) { - return nullptr; - } - auto reshape_op = TryCast((*get_dimension_size_op)->getNextNode(), - /*name=*/"reshape_op"); - if (failed(reshape_op)) { - return nullptr; - } - auto concatenate_op = TryCast((*reshape_op)->getNextNode(), - /*name=*/"concatenate_op"); - if (failed(concatenate_op)) { - return nullptr; +Operation* GetBroadcastedUserOp(Operation* op) { + // Broadcast bias for known input shape. + auto broadcast_in_dim_op = FindUserOfType(op); + if (broadcast_in_dim_op != nullptr) { + auto target_op = FindUserOfType(broadcast_in_dim_op); + if (target_op != nullptr) return target_op; } + // Broadcast bias for unknown input shape. 
+ auto get_dimension_size_op = FindUserOfType(op); + if (get_dimension_size_op == nullptr) return nullptr; + + auto reshape_op = FindUserOfType(get_dimension_size_op); + if (reshape_op == nullptr) return nullptr; + + auto concatenate_op = FindUserOfType(reshape_op); + if (concatenate_op == nullptr) return nullptr; + auto dynamic_broadcast_in_dim_op = - TryCast((*concatenate_op)->getNextNode(), - /*name=*/"dynamic_broadcast_in_dim_op"); - if (failed(dynamic_broadcast_in_dim_op)) { - return nullptr; - } - auto target_op = TryCast((*dynamic_broadcast_in_dim_op)->getNextNode(), - /*name=*/"target_op"); - if (failed(target_op)) { - return nullptr; - } - return *target_op; + FindUserOfType(concatenate_op); + if (dynamic_broadcast_in_dim_op == nullptr) return nullptr; + + auto target_op = FindUserOfType(dynamic_broadcast_in_dim_op); + return target_op; } -// Checks if all inputs and outputs are quantized. +// Checks if one of the inputs and outputs are quantized. bool HasQuantizedOperandOrOutput(Operation* call_op) { SmallVector arg_types; for (const Value arg : call_op->getOperands()) { @@ -147,8 +156,8 @@ bool HasQuantizedOperandOrOutput(Operation* call_op) { output_types.push_back(output.getType()); } - return absl::c_all_of(arg_types, IsQuantizedTensorType) && - absl::c_all_of(output_types, IsQuantizedTensorType); + return absl::c_any_of(arg_types, IsQuantizedTensorType) && + absl::c_any_of(output_types, IsQuantizedTensorType); } // Gets the corresponding quantized function name from the given function name. @@ -161,7 +170,7 @@ std::string GetQuantizedFunctionName(const StringRef func_name) { // Returns true if `xla_call_module_op` is quantized. To be considered // quantized, it should meet three conditions: -// 1. At least one of the inputs or outputs should be a uniform quantized type. +// 1. At least one of the inputs and outputs should be a uniform quantized type. // 2. `xla_call_module_op` should have the `kQuantTraitAttrName` attribute. // 3. 
It should also have the `kEntryFuncAttrName` attribute, which points to // the function that `xla_call_module_op` represents. @@ -211,6 +220,9 @@ void SetQuantizedFunctionType(PatternRewriter& rewriter, } // Creates a UniformQuantize op and sets it as return op. +// The requantize scale and zero point should be determined from the +// entry_func_op's output, containing information on layerStats of the +// entire function. void CreateAndReturnUniformQuantizeOp(PatternRewriter& rewriter, Operation& op, func::FuncOp entry_func_op, const Type func_result_type) { @@ -226,35 +238,37 @@ template // and sets the quantized bias as the return op. void CreateAndReturnQuantizedBiasPattern( Operation* op, PatternRewriter& rewriter, func::FuncOp entry_func_op, - const Type func_result_type, const Type gemm_style_quantized_element_type, - GemmStyleOp gemm_style_op, double result_scale) { + const Type func_result_type, const Type accumulation_quantized_element_type, + GemmStyleOp gemm_style_op) { Value bias_op = op->getOperand(1); Value add_op_result = op->getResult(0); - // For bias add with dynamic shape, quantize the broadcasted bias. - if (auto dynamic_bcast_op = - cast_or_null(bias_op.getDefiningOp())) { - const UniformQuantizedType dynamic_bcast_quantized_element_type = - CreateI32F32UniformQuantizedType(gemm_style_op->getLoc(), - *rewriter.getContext(), result_scale, - /*zero_point=*/0); - - Value dynamic_bcast_op_result = dynamic_bcast_op->getResult(0); - auto dynamic_bcast_op_result_type = - dynamic_bcast_op_result.getType().cast(); - const ArrayRef dynamic_bcast_shape = - dynamic_bcast_op_result_type.getShape(); - - const TensorType new_dynamic_bcast_op_result_type = - dynamic_bcast_op_result_type.cloneWith( - dynamic_bcast_shape, gemm_style_quantized_element_type); - dynamic_bcast_op_result.setType(new_dynamic_bcast_op_result_type); + + // Broadcast bias value if unmatched with output shape. 
+ auto bcast_op = TryCast(bias_op.getDefiningOp(), + /*name=*/"broadcast_in_dim_op"); + + if (failed(bcast_op)) { + bcast_op = TryCast( + bias_op.getDefiningOp(), + /*name=*/"dynamic_broadcast_in_dim_op"); } + // Update the bias type for both static and dynamic broadcasts. + if (succeeded(bcast_op)) { + Value bcast_op_result = (*bcast_op)->getResult(0); + auto bcast_op_result_type = + bcast_op_result.getType().cast(); + const ArrayRef bcast_shape = bcast_op_result_type.getShape(); + const TensorType new_bcast_op_result_type = bcast_op_result_type.cloneWith( + bcast_shape, accumulation_quantized_element_type); + bcast_op_result.setType(new_bcast_op_result_type); + } + const auto add_op_result_type = add_op_result.getType().cast(); const ArrayRef add_op_shape = add_op_result_type.getShape(); // For quantized bias add case, lhs, rhs, and result have the same types. const TensorType new_add_op_result_type = add_op_result_type.cloneWith( - add_op_shape, gemm_style_quantized_element_type); + add_op_shape, accumulation_quantized_element_type); add_op_result.setType(new_add_op_result_type); AddOp bias_add_op = @@ -287,12 +301,14 @@ template LogicalResult MatchGemmStyleOp(func::FuncOp entry_func_op) { auto op_iterator_range = entry_func_op.getOps(); if (op_iterator_range.empty()) { - LLVM_DEBUG(llvm::dbgs() << "Function does not have GemmStyle op.\n"); + LLVM_DEBUG(llvm::dbgs() << "Function does not have " + << GemmStyleOp::getOperationName() << " op.\n"); return failure(); } if (!isa( (*op_iterator_range.begin()).getResult().getType())) { - LLVM_DEBUG(llvm::dbgs() << "GemmStyle op must have ranked tensor type.\n"); + LLVM_DEBUG(llvm::dbgs() << GemmStyleOp::getOperationName() + << " op must have ranked tensor type.\n"); return failure(); } @@ -300,8 +316,8 @@ LogicalResult MatchGemmStyleOp(func::FuncOp entry_func_op) { entry_func_op.getBody().getArguments(); // Function must have input, filter, and optionally bias. 
if (operands.size() != 2 && operands.size() != 3) { - LLVM_DEBUG(llvm::dbgs() - << "GemmStyle op function should have 2 or 3 operands.\n"); + LLVM_DEBUG(llvm::dbgs() << GemmStyleOp::getOperationName() + << " op function should have 2 or 3 operands.\n"); return failure(); } return success(); @@ -309,68 +325,144 @@ LogicalResult MatchGemmStyleOp(func::FuncOp entry_func_op) { // Gemm Style Op: glossary/gemm. template -void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter) { - // Update the output type of the gemm_style op. - GemmStyleOp gemm_style_op = *entry_func_op.getOps().begin(); +void RewriteGemmStyleOp(func::FuncOp entry_func_op, PatternRewriter& rewriter, + bool enable_per_channel_quantized_weight) { + const GemmStyleOp gemm_style_op = + *entry_func_op.getOps().begin(); const Type input_type = entry_func_op.getArgumentTypes()[0]; const Type filter_type = entry_func_op.getArgumentTypes()[1]; const Type func_result_type = entry_func_op.getResultTypes()[0]; - const double input_scale = - getElementTypeOrSelf(input_type).cast().getScale(); - const double filter_scale = - getElementTypeOrSelf(filter_type).cast().getScale(); - const double result_scale = input_scale * filter_scale; - - // Define the intermediate output type, which is an i32 quantized type. - // This is intermediate because the final output type of the entry_func_op - // should be an i8 quantized type. 
- const UniformQuantizedType gemm_style_quantized_element_type = - CreateI32F32UniformQuantizedType(gemm_style_op->getLoc(), - *rewriter.getContext(), result_scale, - /*zero_point=*/0); - Value gemm_style_op_result = gemm_style_op->getResult(0); - auto gemm_style_op_result_type = + const auto gemm_style_op_result_type = gemm_style_op_result.getType().cast(); const ArrayRef gemm_style_shape = gemm_style_op_result_type.getShape(); - const TensorType new_gemm_style_op_result_type = - gemm_style_op_result_type.cloneWith(gemm_style_shape, - gemm_style_quantized_element_type); + Type accumulation_quantized_element_type; + TensorType new_gemm_style_op_result_type; + + const double input_scale = + getElementTypeOrSelf(input_type).cast().getScale(); + + if (enable_per_channel_quantized_weight) { + ArrayRef filter_scales = getElementTypeOrSelf(filter_type) + .cast() + .getScales(); + std::vector result_scales; + result_scales.reserve(filter_scales.size()); + + for (double filter_scale : filter_scales) { + result_scales.push_back(input_scale * filter_scale); + } + + const ArrayRef zero_points = + getElementTypeOrSelf(filter_type) + .cast() + .getZeroPoints(); + + // [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f] + accumulation_quantized_element_type = + CreateI32F32UniformQuantizedPerAxisType( + gemm_style_op->getLoc(), *rewriter.getContext(), result_scales, + zero_points, /*quantization_dimension=*/3); + + new_gemm_style_op_result_type = gemm_style_op_result_type.cloneWith( + gemm_style_shape, accumulation_quantized_element_type); + } else { + const double filter_scale = getElementTypeOrSelf(filter_type) + .cast() + .getScale(); + double result_scale = input_scale * filter_scale; + + accumulation_quantized_element_type = CreateI32F32UniformQuantizedType( + gemm_style_op->getLoc(), *rewriter.getContext(), result_scale, + /*zero_point=*/0); + + new_gemm_style_op_result_type = gemm_style_op_result_type.cloneWith( + gemm_style_shape, accumulation_quantized_element_type); + } + 
gemm_style_op_result.setType(new_gemm_style_op_result_type); rewriter.setInsertionPointAfter(gemm_style_op); - Operation* next_op = gemm_style_op->getNextNode(); + Operation* next_op = FindUserOfType<>(gemm_style_op); + // If activation exists, omit clipping op. + // Since out_scale and out_zp are computed based on clipped range, + // explicit activation clipping op is not required. if (isa(next_op) && gemm_style_op->hasOneUse()) { // bias fusion CreateAndReturnQuantizedBiasPattern( next_op, rewriter, entry_func_op, func_result_type, - gemm_style_quantized_element_type, gemm_style_op, result_scale); + accumulation_quantized_element_type, gemm_style_op); } else if (auto add_op = cast_or_null( - GetDynamicallyBroadcastedUserOp(gemm_style_op))) { - // dynamic bias fusion + GetBroadcastedUserOp(gemm_style_op))) { + // broadcasted bias fusion rewriter.setInsertionPointAfter(add_op); CreateAndReturnQuantizedBiasPattern( add_op, rewriter, entry_func_op, func_result_type, - gemm_style_quantized_element_type, gemm_style_op, result_scale); + accumulation_quantized_element_type, gemm_style_op); } else { // Non fusible op - // If an op is used multiple times and is not a dynamic shape case, do not - // apply quantization of fused patterns to prevent removal of dependee ops. + // If an op is used multiple times and is not a broadcasted shape case, + // do not apply quantization of fused patterns to prevent removal of + // dependee ops. CreateAndReturnUniformQuantizeOp(rewriter, *gemm_style_op, entry_func_op, func_result_type); } } +template +// Match for tensor manipulation op. 
+LogicalResult MatchSingularOp(func::FuncOp entry_func_op) { + auto op_iterator_range = entry_func_op.getOps(); + if (op_iterator_range.empty()) { + LLVM_DEBUG(llvm::dbgs() << "Function does not have " + << SingularOp::getOperationName() << " op.\n"); + return failure(); + } + if (!isa( + (*op_iterator_range.begin()).getResult().getType())) { + LLVM_DEBUG(llvm::dbgs() << SingularOp::getOperationName() + << " op must have ranked tensor type.\n"); + return failure(); + } + return success(); +} + +template +void RewriteSingularOp(func::FuncOp entry_func_op, PatternRewriter& rewriter) { + SingularOp singular_op = *entry_func_op.getOps().begin(); + + const Type operand_type = entry_func_op.getArgumentTypes()[0]; + const Type func_result_type = entry_func_op.getResultTypes()[0]; + + // Get the quantized tensor manipulation op's output type and update. + Value singular_op_result = singular_op.getResult(); + auto singular_op_result_type = + singular_op_result.getType().cast(); + const ArrayRef singular_op_shape = + singular_op_result_type.getShape(); + const TensorType new_singular_op_result_type = + singular_op_result_type.cloneWith( + singular_op_shape, + getElementTypeOrSelf(operand_type).cast()); + singular_op_result.setType(new_singular_op_result_type); + + // Create requantization op and return. + rewriter.setInsertionPointAfter(singular_op); + CreateAndReturnUniformQuantizeOp(rewriter, *singular_op, entry_func_op, + func_result_type); +} + // Quantizes the entry function's body containing a `DotGeneralOp`. 
class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { public: - explicit QuantizeDotGeneralOpPattern() = default; + explicit QuantizeDotGeneralOpPattern( + bool enable_per_channel_quantized_weight) {} LogicalResult match(func::FuncOp entry_func_op) const override { return MatchGemmStyleOp(entry_func_op); @@ -378,14 +470,19 @@ class QuantizeDotGeneralOpPattern : public EntryFuncBodyQuantizationPattern { void rewrite(func::FuncOp entry_func_op, PatternRewriter& rewriter) const override { - RewriteGemmStyleOp(entry_func_op, rewriter); + RewriteGemmStyleOp( + entry_func_op, rewriter, + /*enable_per_channel_quantized_weight=*/false); } }; // Quantizes the entry function's body containing a `ConvolutionOp`. class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { public: - explicit QuantizeConvolutionOpPattern() = default; + explicit QuantizeConvolutionOpPattern( + bool enable_per_channel_quantized_weight) + : enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} LogicalResult match(func::FuncOp entry_func_op) const override { return MatchGemmStyleOp(entry_func_op); @@ -393,8 +490,32 @@ class QuantizeConvolutionOpPattern : public EntryFuncBodyQuantizationPattern { void rewrite(func::FuncOp entry_func_op, PatternRewriter& rewriter) const override { - RewriteGemmStyleOp(entry_func_op, rewriter); + RewriteGemmStyleOp(entry_func_op, rewriter, + enable_per_channel_quantized_weight_); } + + private: + bool enable_per_channel_quantized_weight_; +}; + +// Quantizes the entry function's body containing a `GatherOp`. 
+class QuantizeGatherOpPattern : public EntryFuncBodyQuantizationPattern { + public: + explicit QuantizeGatherOpPattern(bool enable_per_channel_quantized_weight) + : enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} + + LogicalResult match(func::FuncOp entry_func_op) const override { + return MatchSingularOp(entry_func_op); + } + + void rewrite(func::FuncOp entry_func_op, + PatternRewriter& rewriter) const override { + RewriteSingularOp(entry_func_op, rewriter); + } + + private: + bool enable_per_channel_quantized_weight_; }; // Converts `entry_func_op` to be quantized according to the respective @@ -453,8 +574,11 @@ template >> class XlaCallModuleOpToCallOp : public OpRewritePattern { public: - explicit XlaCallModuleOpToCallOp(MLIRContext& ctx) - : OpRewritePattern(&ctx) {} + explicit XlaCallModuleOpToCallOp(MLIRContext& ctx, + bool enable_per_channel_quantized_weight) + : OpRewritePattern(&ctx), + enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} LogicalResult match(TF::XlaCallModuleOp op) const override { ModuleOp module_op = op->getParentOfType(); @@ -468,15 +592,19 @@ class XlaCallModuleOpToCallOp : public OpRewritePattern { op->emitError("Failed to find a valid entry function."); return failure(); } - return FuncBodyRewritePatternT().match(entry_func_op); + return FuncBodyRewritePatternT(enable_per_channel_quantized_weight_) + .match(entry_func_op); } void rewrite(TF::XlaCallModuleOp xla_call_module_op, PatternRewriter& rewriter) const override { ReplaceQuantizedXlaCallModuleOpWithQuantizedCallOp( *rewriter.getContext(), rewriter, xla_call_module_op, - FuncBodyRewritePatternT()); + FuncBodyRewritePatternT(enable_per_channel_quantized_weight_)); } + + private: + bool enable_per_channel_quantized_weight_; }; // Quantizes op with regions such as stablehlo.reduce_window op. 
@@ -670,7 +798,8 @@ bool IsQuantizedCompositeFunction(func::CallOp call_op) { if (type.getElementType().isa()) { return false; } - if (type.getElementType().isa()) { + if (type.getElementType() + .isa()) { has_quantized_types = true; } } @@ -680,7 +809,8 @@ bool IsQuantizedCompositeFunction(func::CallOp call_op) { if (type.getElementType().isa()) { return false; } - if (type.getElementType().isa()) { + if (type.getElementType() + .isa()) { has_quantized_types = true; } } @@ -749,9 +879,17 @@ bool IsConnectedWithQuantizedCompsiteFunction(Operation* same_scale_op) { // TODO: b/307620428 - Increase fused op coverage for static range quantization. void PopulateFusedGemmStylePatterns(MLIRContext& ctx, - RewritePatternSet& patterns) { - patterns.add, - XlaCallModuleOpToCallOp>(ctx); + RewritePatternSet& patterns, + bool enable_per_channel_quantized_weight) { + patterns.add>( + ctx, enable_per_channel_quantized_weight); + // By default, we set `enable_per_channel_quantized_weight` to true for + // passes to ensure per-channel quantization for all supported ops. + // For ops that do not yet support per-channel quantization, explicitly + // mark as false like below. We will soon add support for per-channel + // quantization of the following ops. + patterns.add>( + ctx, /*enable_per_channel_quantized_weight=*/false); } void PopulateQuantizeOpWithRegionPattern(MLIRContext& ctx, @@ -759,4 +897,10 @@ void PopulateQuantizeOpWithRegionPattern(MLIRContext& ctx, patterns.add(ctx); } +void PopulateQuantizeSingularOpPatterns(MLIRContext& ctx, + RewritePatternSet& patterns) { + // TODO: b/307620772 - Per-channel quantization for gather. 
+ patterns.add>( + ctx, /*enable_per_channel_quantized_weight=*/false); +} } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h index 91170115ce2baa..9922e5bd69eb49 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantization_patterns.h @@ -124,11 +124,20 @@ class StableHloQuantizationPattern : public RewritePattern { // Const-> QuantizeOp pattern will be handled separately. return failure(); } - if (Operation* quantizing_op = quantize_operand.getDefiningOp()) { + if (Operation* quantizing_op = quantize_operand.getDefiningOp(); + quantizing_op != nullptr) { quantizing_ops.push_back(quantizing_op); + } else { + // When `QuantizeOpT`'s operand does not have a defining op, it means it + // is a `BlockArgument`. The pattern does not match if there is no op to + // quantize. + return failure(); } } + // Safeguard check to ensure that there is at least one quantizable op. + if (quantizing_ops.empty()) return failure(); + absl::flat_hash_set ops_blocklist = quant_params_.quant_spec.ops_blocklist; absl::flat_hash_set nodes_blocklist = @@ -276,15 +285,19 @@ class StableHloQuantizationPattern : public RewritePattern { }; // Gemm Style Op: glossary/gemm. -// Populates conversion patterns to unfuse batch normalization operations. void PopulateFusedGemmStylePatterns(MLIRContext& ctx, - RewritePatternSet& patterns); + RewritePatternSet& patterns, + bool enable_per_channel_quantized_weight); // Populates pattern for quantization of ops with regions such as // stablehlo.reduce_window op. void PopulateQuantizeOpWithRegionPattern(MLIRContext& ctx, RewritePatternSet& patterns); +// Populates conversion patterns for unary data movement ops. 
+void PopulateQuantizeSingularOpPatterns(MLIRContext& ctx, + RewritePatternSet& patterns); + } // namespace mlir::quant::stablehlo #endif // TENSORFLOW_COMPILER_MLIR_QUANTIZATION_STABLEHLO_PASSES_QUANTIZATION_PATTERNS_H_ diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc index 8d321d9269345c..fd5898d686be96 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize.cc @@ -44,7 +44,7 @@ namespace mlir::quant::stablehlo { namespace { // Base struct for quantization. -template +template struct StableHloQuantizationBase : public StableHloQuantizationPattern { explicit QuantizePass() = default; - explicit QuantizePass(const QuantizationSpecs& quant_specs) - : quant_specs_(quant_specs) {} + explicit QuantizePass(const QuantizationSpecs& quant_specs, + bool enable_per_channel_quantized_weight) + : quant_specs_(quant_specs), + enable_per_channel_quantized_weight_( + enable_per_channel_quantized_weight) {} - QuantizePass(const QuantizePass& other) : quant_specs_(other.quant_specs_) {} + QuantizePass(const QuantizePass& other) + : quant_specs_(other.quant_specs_), + enable_per_channel_quantized_weight_( + other.enable_per_channel_quantized_weight_) {} private: void runOnOperation() override; QuantizationSpecs quant_specs_; + bool enable_per_channel_quantized_weight_; }; void QuantizePass::runOnOperation() { @@ -131,14 +138,14 @@ void QuantizePass::runOnOperation() { patterns.add( &ctx, quant_params); PopulateQuantizeOpWithRegionPattern(ctx, patterns); - PopulateFusedGemmStylePatterns(ctx, patterns); + PopulateFusedGemmStylePatterns(ctx, patterns, + enable_per_channel_quantized_weight_); + PopulateQuantizeSingularOpPatterns(ctx, patterns); if (failed(applyPatternsAndFoldGreedily(module_op, std::move(patterns)))) { // There are cases where no rewrites happen even if a pattern matches, // 
causing this to result in a convergence failure. Consider this as a // best-effort. - // TODO: b/305469508 - Make QuantizationPattern converge if there are no - // patterns that are rewritable. module_op.emitWarning("Failed to converge pattern at QuantizePass."); } } @@ -146,8 +153,10 @@ void QuantizePass::runOnOperation() { } // namespace std::unique_ptr> CreateQuantizePass( - const QuantizationSpecs& quantization_specs) { - return std::make_unique(quantization_specs); + const QuantizationSpecs& quantization_specs, + bool enable_per_channel_quantized_weight) { + return std::make_unique(quantization_specs, + enable_per_channel_quantized_weight); } } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc index 026c0742615128..0d491a8cb66404 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/quantize_composite_functions.cc @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ==============================================================================*/ +#include + #include "absl/status/status.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project // IWYU pragma: keep #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project @@ -25,6 +27,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/quantization_config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep @@ -50,6 +53,11 @@ class QuantizeCompositeFunctionsPass using impl::QuantizeCompositeFunctionsPassBase< QuantizeCompositeFunctionsPass>::QuantizeCompositeFunctionsPassBase; + explicit QuantizeCompositeFunctionsPass( + bool enable_per_channel_quantized_weight) { + enable_per_channel_quantized_weight_ = enable_per_channel_quantized_weight; + } + private: void runOnOperation() override; }; @@ -65,11 +73,16 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { // (XlaCallModuleOps) with quantized input and output types, which are not // allowed in the TF dialect. pm.enableVerifier(false); - - pm.addNestedPass(CreatePrepareQuantizePass()); + PrepareQuantizePassOptions options; + options.enable_per_channel_quantized_weight_ = + enable_per_channel_quantized_weight_; + // Change this to user-given bit width once we have custom configuration. + options.bit_width_ = 8; + pm.addNestedPass(createPrepareQuantizePass(options)); // QuantizePass modifies FuncOps referenced outside of its given scope // and therefore requires a module-level context. 
- pm.addPass(CreateQuantizePass(quant_specs)); + pm.addPass( + CreateQuantizePass(quant_specs, enable_per_channel_quantized_weight_)); pm.addNestedPass(createPostQuantizePass()); ModuleOp module_op = getOperation(); @@ -79,7 +92,13 @@ void QuantizeCompositeFunctionsPass::runOnOperation() { signalPassFailure(); } } - } // namespace +// Creates an instance of the TensorFlow dialect QuantizeCompositeFunctionsPass. +std::unique_ptr> CreateQuantizeCompositeFunctionsPass( + bool enable_per_channel_quantized_weight) { + return std::make_unique( + enable_per_channel_quantized_weight); +} + } // namespace mlir::quant::stablehlo diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc index aa43b64b97b10b..6a152843e3278a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.cc @@ -32,6 +32,7 @@ limitations under the License. 
#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project #include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/common/lift_as_function_call.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/utils/stablehlo_type_utils.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -45,7 +46,6 @@ namespace mlir::quant::stablehlo { namespace { -constexpr StringRef kQuantizeTargetOpAttr = "tf_quant.composite_function"; constexpr StringRef kStablehloModuleAttrsAttrName = "_stablehlo_module_attrs"; constexpr StringRef kUsesShapePolymorphismAttr = "jax.uses_shape_polymorphism"; @@ -73,19 +73,6 @@ class ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass void runOnOperation() override; }; -// Finds the main function from module_op. Returns nullptr if not found. -// The model's signature keys will contain "@serving_default" as default TF -// Model signature, or "@main" if it is in being exported from MLIR module to -// GraphDef. -func::FuncOp GetMainFunc(ModuleOp module_op) { - for (auto func_op : module_op.getOps()) { - if (func_op.getSymName().equals("main") || - func_op.getSymName().equals("serving_default")) - return func_op; - } - return nullptr; -} - // Creates a unique stablehlo function name based on op order. 
std::string CreateStablehloFunctionName(const int id) { return Twine("_stablehlo_main_").concat(std::to_string(id)).str(); @@ -447,7 +434,7 @@ void ReplaceStablehloOpsInMainFunctionWithXlaCallModuleOpsPass:: runOnOperation() { ModuleOp module_op = getOperation(); - func::FuncOp main_func = GetMainFunc(module_op); + func::FuncOp main_func = FindMainFuncOp(module_op); if (!main_func) return; DuplicateSmallConstantOps(module_op, main_func); diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td index b7e2abd31d03ba..38d60e94f97e9a 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.td @@ -35,6 +35,11 @@ def TestPostCalibrationComponentPass : Pass<"stablehlo-test-post-calibration-com let description = [{ Runs the post-calibration passes for post-training quantization. }]; + let options = [ + Option<"unpack_quantized_types_", "unpack-quantized-types", "bool", + /*default=*/"true", "Unpacks ops with uniform quantized types into " + "operations without uniform quantized types (mostly i8 or i32)."> + ]; let dependentDialects = [ "mlir::stablehlo::StablehloDialect", "mlir::TF::TensorFlowDialect", "mlir::func::FuncDialect", "mlir::mhlo::MhloDialect", @@ -43,3 +48,30 @@ def TestPostCalibrationComponentPass : Pass<"stablehlo-test-post-calibration-com "mlir::quantfork::QuantizationForkDialect", ]; } + +def TestTFToStablehloPass : Pass<"stablehlo-test-tf-to-stablehlo", "mlir::ModuleOp"> { + let summary = "Test-only pass to test TFToStablehloPasses."; + let description = [{ + Runs the TFToStablehloPasses. 
+ }]; + let dependentDialects = [ + "mlir::stablehlo::StablehloDialect", "mlir::TF::TensorFlowDialect", + "mlir::chlo::ChloDialect", "mlir::quant::QuantizationDialect", + "mlir::mhlo::MhloDialect", "mlir::shape::ShapeDialect", + "mlir::sparse_tensor::SparseTensorDialect", "mlir::vhlo::VhloDialect", + ]; +} + +def TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass : + Pass<"stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs", "mlir::ModuleOp"> { + let summary = "Test-only pass for testing the LiftQuantizableSpotsAsFunctionsPass with a predefined QuantizationSpecs."; + let description = [{ + This test-only pass is the same as `LiftQuantizableSpotsAsFunctionsPass` but + has predefined `QuantizationSpecs` to make FileCheck testing easier. + }]; + let dependentDialects = [ + "mlir::func::FuncDialect", + "mlir::stablehlo::StablehloDialect", + "TF::TensorFlowDialect", + ]; +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc new file mode 100644 index 00000000000000..e8cb185cb7b55d --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_lift_quantizable_spots_as_functions_with_quantization_specs.cc @@ -0,0 +1,101 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/string_view.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/passes.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" // IWYU pragma: keep +#include "tsl/platform/protobuf.h" // IWYU pragma: keep + +namespace mlir::quant::stablehlo::testing { + +// NOLINTNEXTLINE - Automatically generated. +#define GEN_PASS_DEF_TESTLIFTQUANTIZABLESPOTSASFUNCTIONSWITHQUANTIZATIONSPECSPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h.inc" + +namespace { + +using ::stablehlo::quantization::QuantizationSpecs; +using ::tsl::protobuf::TextFormat; +// NOLINTNEXTLINE(misc-include-cleaner) - Required for OSS. +using ::tsl::protobuf::io::ArrayInputStream; + +// Configure `QuantizationSpecs` to disable quantization for all dot_general +// quantizable units. 
+constexpr absl::string_view kSpecsDisableAllDotGeneralByFuncName = + R"pb(specs + [ { + matcher { function_name { regex: "composite_dot_general_.*" } } + method { no_quantization {} } + }])pb"; + +class TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass + : public impl:: + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase< + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass> { + public: + using impl::TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase< + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass>:: + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPassBase; + + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID( + TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass) + + private: + void runOnOperation() override; +}; + +// Parses a text proto into a `QuantizationSpecs` proto. Returns +// `InvalidArgumentError` if `text_proto` is invalid. +absl::StatusOr ParseQuantizationSpecsTextProto( + const absl::string_view text_proto) { + QuantizationSpecs quantization_specs; + TextFormat::Parser parser; + ArrayInputStream input_stream(text_proto.data(), text_proto.size()); + if (parser.Parse(&input_stream, &quantization_specs)) { + return quantization_specs; + } + return absl::InvalidArgumentError("Could not parse text proto."); +} + +void TestLiftQuantizableSpotsAsFunctionsWithQuantizationSpecsPass:: + runOnOperation() { + PassManager pass_manager{&getContext()}; + + const absl::StatusOr quantization_specs = + ParseQuantizationSpecsTextProto(kSpecsDisableAllDotGeneralByFuncName); + if (!quantization_specs.ok()) { + signalPassFailure(); + return; + } + + pass_manager.addPass( + CreateLiftQuantizableSpotsAsFunctionsPass(*quantization_specs)); + + if (failed(pass_manager.run(getOperation()))) { + signalPassFailure(); + return; + } +} + +} // namespace +} // namespace mlir::quant::stablehlo::testing diff --git 
a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc index f53d420fe6fc3a..88fa9e59b4977d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_post_calibration_component.cc @@ -25,6 +25,7 @@ limitations under the License. #include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/post_calibration.h" +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" // IWYU pragma: keep @@ -38,10 +39,16 @@ namespace mlir::quant::stablehlo::testing { namespace { +using ::stablehlo::quantization::PipelineConfig; +using ::stablehlo::quantization::StaticRangePtqPreset; + class TestPostCalibrationComponentPass : public impl::TestPostCalibrationComponentPassBase< TestPostCalibrationComponentPass> { public: + using impl::TestPostCalibrationComponentPassBase< + TestPostCalibrationComponentPass>::TestPostCalibrationComponentPassBase; + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestPostCalibrationComponentPass) private: @@ -54,8 +61,12 @@ void TestPostCalibrationComponentPass::runOnOperation() { OpPassManager pm(ModuleOp::getOperationName()); + StaticRangePtqPreset static_range_ptq_preset; + PipelineConfig pipeline_config; + pipeline_config.set_unpack_quantized_types(unpack_quantized_types_); + PostCalibrationComponent component(&ctx); - component.AddPasses(pm); + 
component.AddPasses(pm, static_range_ptq_preset, pipeline_config); // Adds a XlaCallModuleOp deserialization pass for easier testing by // inspecting the contents of serialized StableHLO function. diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_tf_to_stablehlo_pass.cc b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_tf_to_stablehlo_pass.cc new file mode 100644 index 00000000000000..3af53a213b0064 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/test_tf_to_stablehlo_pass.cc @@ -0,0 +1,69 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/QuantOps.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project // IWYU pragma: keep +#include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Pass/PassRegistry.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "stablehlo/dialect/ChloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo // IWYU pragma: keep +#include "stablehlo/dialect/VhloOps.h" // from @stablehlo // IWYU pragma: keep +#include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" +#include "tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" // IWYU pragma: keep +#include "xla/mlir_hlo/mhlo/IR/hlo_ops.h" // IWYU pragma: keep + +namespace mlir::quant::stablehlo::testing { + +#define GEN_PASS_DEF_TESTTFTOSTABLEHLOPASS +#include "tensorflow/compiler/mlir/quantization/stablehlo/passes/testing/passes.h.inc" + +namespace { + +using ::tensorflow::quantization::AddTFToStablehloPasses; +using ::tensorflow::quantization::RunPassesOnModuleOp; + +class TestTFToStablehloPass + : public impl::TestTFToStablehloPassBase { 
+ public: + MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestTFToStablehloPass) + + private: + void runOnOperation() override; +}; + +void TestTFToStablehloPass::runOnOperation() { + ModuleOp module_op = getOperation(); + MLIRContext* ctx = &getContext(); + mlir::PassManager pm(ctx); + + AddTFToStablehloPasses(pm); + if (!RunPassesOnModuleOp( + /*mlir_dump_file_name=*/"test_tf_to_stablehlo_pass", pm, module_op) + .ok()) { + return signalPassFailure(); + } +} + +} // namespace +} // namespace mlir::quant::stablehlo::testing diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD index 3493fdf3cbde92..a91ceec6e151f5 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/BUILD @@ -32,7 +32,6 @@ pytype_strict_library( deps = [ ":pywrap_quantization", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_py", - "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_py", "//tensorflow/compiler/mlir/quantization/tensorflow/python:py_function_lib_py", "//tensorflow/compiler/mlir/quantization/tensorflow/python:representative_dataset", "//tensorflow/compiler/mlir/quantization/tensorflow/python:save_model", @@ -57,6 +56,7 @@ pytype_strict_library( "//tensorflow/python/ops:math_ops", "//tensorflow/python/ops:nn_ops", "//tensorflow/python/platform:client_testlib", + "//tensorflow/python/saved_model:load", "//tensorflow/python/saved_model:save", "//tensorflow/python/types:core", "//third_party/py/numpy", @@ -67,16 +67,25 @@ pytype_strict_library( tf_py_strict_test( name = "quantize_model_test", srcs = ["integration_test/quantize_model_test.py"], + shard_count = 50, # Parallelize the test to avoid timeouts. 
deps = [ ":quantization", ":quantize_model_test_base", "//tensorflow/compiler/mlir/quantization/stablehlo:quantization_config_proto_py", "//tensorflow/compiler/mlir/quantization/tensorflow/python:representative_dataset", + "//tensorflow/python/eager:def_function", + "//tensorflow/python/framework:dtypes", "//tensorflow/python/framework:ops", + "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/framework:test_lib", + "//tensorflow/python/module", + "//tensorflow/python/ops:math_ops", + "//tensorflow/python/ops:nn_ops", "//tensorflow/python/platform:client_testlib", "//tensorflow/python/saved_model:load", + "//tensorflow/python/saved_model:save", "//tensorflow/python/saved_model:tag_constants", + "//tensorflow/python/types:core", "@absl_py//absl/testing:parameterized", ], ) @@ -86,6 +95,7 @@ tf_python_pybind_extension( srcs = ["pywrap_quantization.cc"], pytype_srcs = ["pywrap_quantization.pyi"], deps = [ + "//tensorflow/compiler/mlir/quantization/stablehlo/cc:config_impl", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:static_range_ptq", "//tensorflow/compiler/mlir/quantization/tensorflow/python:type_casters", "@pybind11", diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py index 59d728c23813f8..f359981eaf89d7 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test.py @@ -13,7 +13,7 @@ # limitations under the License. 
# ============================================================================== import itertools -from typing import Optional, Sequence +from typing import Mapping, Optional, Sequence from absl.testing import parameterized import numpy as np @@ -22,11 +22,19 @@ from tensorflow.compiler.mlir.quantization.stablehlo.python import quantization from tensorflow.compiler.mlir.quantization.stablehlo.python.integration_test import quantize_model_test_base from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as repr_dataset +from tensorflow.python.eager import def_function +from tensorflow.python.framework import dtypes from tensorflow.python.framework import ops +from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util +from tensorflow.python.module import module +from tensorflow.python.ops import math_ops +from tensorflow.python.ops import nn_ops from tensorflow.python.platform import test from tensorflow.python.saved_model import load +from tensorflow.python.saved_model import save from tensorflow.python.saved_model import tag_constants +from tensorflow.python.types import core def parameter_combinations(test_parameters): @@ -46,24 +54,33 @@ class StaticRangeQuantizationTest(quantize_model_test_base.QuantizedModelTest): @parameterized.parameters( parameter_combinations([{ - 'activation_fn': [None], - 'has_bias': [True, False], - 'dim_sizes': [ + 'bias_fn': ( + None, + nn_ops.bias_add, + ), + 'activation_fn': ( + None, + nn_ops.relu, + nn_ops.relu6, + ), + 'dim_sizes': ( # tf.MatMul cases. ([None, 1024], [1024, 3]), # dynamic batch dim. ([1, 1024], [1024, 3]), # tf.BatchMatMul cases. 
([10, 1, 1024], [10, 1024, 3]), ([2, 3, 1, 1024], [2, 3, 1024, 3]), - ], + ), + 'rng_seed': (1230, 1231, 1232, 1233), }]) ) @test_util.run_in_graph_and_eager_modes def test_matmul_ptq_model( self, + bias_fn: Optional[ops.Operation], activation_fn: Optional[ops.Operation], - has_bias: bool, dim_sizes: Sequence[int], + rng_seed: int, ): lhs_dim_size, rhs_dim_size = dim_sizes input_shape = (*lhs_dim_size,) @@ -73,11 +90,11 @@ def test_matmul_ptq_model( input_shape, filter_shape, self._input_saved_model_path, - has_bias, + bias_fn, activation_fn, ) - rng = np.random.default_rng(seed=1235) + rng = np.random.default_rng(rng_seed) input_data = ops.convert_to_tensor( rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( np.float32 @@ -128,7 +145,7 @@ def data_gen() -> repr_dataset.RepresentativeDataset: @parameterized.parameters( parameter_combinations([{ - 'same_scale_op': [ + 'same_scale_op': ( 'concatenate', 'gather', 'max_pool', @@ -137,13 +154,15 @@ def data_gen() -> repr_dataset.RepresentativeDataset: 'select', 'slice', 'transpose', - ], + ), + 'rng_seed': (0, 11, 222, 3333), }]) ) @test_util.run_in_graph_and_eager_modes def test_matmul_and_same_scale_ptq_model( self, same_scale_op: str, + rng_seed: int, ): input_shape = (2, 3, 1, 1024) filter_shape = (2, 3, 1024, 3) @@ -156,7 +175,7 @@ def test_matmul_and_same_scale_ptq_model( same_scale_op, ) - rng = np.random.default_rng(seed=1235) + rng = np.random.default_rng(rng_seed) input_data = ops.convert_to_tensor( rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( np.float32 @@ -205,42 +224,56 @@ def data_gen() -> repr_dataset.RepresentativeDataset: # values are arbitrary. 
self.assertAllClose(new_outputs, expected_outputs, rtol=0.03, atol=0.2) - @parameterized.named_parameters( - { - 'testcase_name': 'none', - 'activation_fn': None, - 'has_bias': False, - 'has_batch_norm': False, - 'input_shape_dynamic': False, - 'enable_per_channel_quantization': False, - }, + @parameterized.parameters( + parameter_combinations([{ + 'bias_fn': ( + None, + nn_ops.bias_add, + ), + 'activation_fn': ( + None, + nn_ops.relu, + nn_ops.relu6, + ), + 'has_batch_norm': (False,), + 'input_shape_dynamic': ( + False, + True, + ), + 'enable_per_channel_quantized_weight': ( + False, + True, + ), + 'rng_seed': (10, 11, 12, 13), + }]) ) @test_util.run_in_graph_and_eager_modes def test_conv_ptq_model( self, + bias_fn: Optional[ops.Operation], activation_fn: Optional[ops.Operation], - has_bias: bool, has_batch_norm: bool, input_shape_dynamic: bool, - enable_per_channel_quantization: bool, + enable_per_channel_quantized_weight: bool, + rng_seed: int, dilations: Sequence[int] = None, ): - input_shape = (None, None, None, 3) if input_shape_dynamic else (1, 3, 4, 3) + input_shape = (None, 3, 4, 3) if input_shape_dynamic else (1, 3, 4, 3) filter_shape = (2, 3, 3, 2) strides = (1, 1, 1, 1) model = self._create_conv2d_model( input_shape, filter_shape, self._input_saved_model_path, - has_bias, - has_batch_norm, + bias_fn, activation_fn, + has_batch_norm, strides, dilations, ) # Generate model input data. 
- rng = np.random.default_rng(seed=1224) + rng = np.random.default_rng(rng_seed) static_input_shape = [dim if dim is not None else 2 for dim in input_shape] input_data = ops.convert_to_tensor( rng.uniform(low=0.0, high=1.0, size=static_input_shape).astype( @@ -268,7 +301,8 @@ def data_gen() -> repr_dataset.RepresentativeDataset: qc.RepresentativeDatasetConfig( tf_record=qc.TfRecordFile(path=dataset_path) ) - ] + ], + enable_per_channel_quantized_weight=enable_per_channel_quantized_weight, ), tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), ) @@ -290,10 +324,19 @@ def data_gen() -> repr_dataset.RepresentativeDataset: # values are arbitrary. self.assertAllClose(new_outputs, expected_outputs, rtol=0.02, atol=0.05) - @parameterized.parameters(('abc,cde->abde',), ('abc,dce->abde',)) + @parameterized.parameters( + parameter_combinations([{ + 'equation': ( + 'abc,cde->abde', + 'abc,dce->abde', + ), + 'rng_seed': (82, 82732, 4444, 14), + }]) + ) def test_einsum_ptq_model( self, equation: str, + rng_seed: int, ): _, y_shape, bias_shape, x_signature, y_signature = ( self._prepare_sample_einsum_datashapes(equation, use_bias=True) @@ -309,7 +352,7 @@ def test_einsum_ptq_model( ) # Generate model input data. 
- rng = np.random.default_rng(seed=1231) + rng = np.random.default_rng(rng_seed) input_data = ops.convert_to_tensor( rng.uniform(low=0.0, high=1.0, size=x_signature).astype('f4') ) @@ -373,6 +416,239 @@ def test_when_preset_not_srq_raises_error(self): config, ) + @test_util.run_in_graph_and_eager_modes + def test_ptq_denylist_basic(self): + """Tests that the op is not quantized when no quantization is enabled.""" + input_shape = (1, 2) + model = self._create_matmul_model( + input_shape, + weight_shape=(2, 3), + saved_model_path=self._input_saved_model_path, + ) + + rng = np.random.default_rng(1230) + random_tensor_gen_fn = lambda: rng.uniform( + low=0.0, high=1.0, size=input_shape + ).astype(np.float32) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(50): + yield {'input_tensor': random_tensor_gen_fn()} + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ] + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + # Disable quantization for the quantizable unit (lifted function) whose + # function name starts with "composite_dot_general". 
+ specs=qc.QuantizationSpecs( + specs=[ + qc.QuantizationSpec( + matcher=qc.MatcherSpec( + function_name=qc.FunctionNameMatcherSpec( + regex='composite_dot_general.*' + ) + ), + method=qc.Method(no_quantization={}), + ) + ] + ), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + input_data = ops.convert_to_tensor(random_tensor_gen_fn()) + expected_outputs = model.matmul(input_data) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + + new_outputs = root.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + + # Indirectly tests that the model is not quantized by asserting that there + # are negligible numeric difference. + self.assertAllClose(new_outputs, expected_outputs, rtol=0.000001) + + @test_util.run_in_graph_and_eager_modes + def test_ptq_selective_denylist(self): + """Tests that the op is not quantized when no quantization is enabled.""" + + rng = np.random.default_rng(1230) + random_tensor_gen_fn = lambda shape: rng.uniform( + low=-1.0, high=1.0, size=shape + ).astype(np.float32) + + class TwoMatmulModel(module.Module): + """A model with two matmul ops.""" + + @def_function.function + def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: + """Performs a matrix multiplication. + + Args: + input_tensor: Input tensor to matmul with the filter. 
+ + Returns: + A 'output' -> output tensor mapping + """ + out = math_ops.matmul(input_tensor, random_tensor_gen_fn((2, 3))) + out = math_ops.matmul(out, random_tensor_gen_fn((3, 4))) + return {'output': out} + + model = TwoMatmulModel() + input_shape = (1, 2) + + save.save( + model, + self._input_saved_model_path, + signatures=model.matmul.get_concrete_function( + tensor_spec.TensorSpec( + shape=input_shape, dtype=dtypes.float32, name='input_tensor' + ) + ), + ) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(50): + yield {'input_tensor': random_tensor_gen_fn(input_shape)} + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ), + ], + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + # Disable quantization for the quantizable unit (lifted function) whose + # function name matches "composite_dot_general_fn_1". + # "composite_dot_general_fn_2" will be quantized. 
+ specs=qc.QuantizationSpecs( + specs=[ + qc.QuantizationSpec( + matcher=qc.MatcherSpec( + function_name=qc.FunctionNameMatcherSpec( + regex='composite_dot_general_fn_1' + ) + ), + method=qc.Method(no_quantization={}), + ) + ] + ), + ) + + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + input_data = ops.convert_to_tensor(random_tensor_gen_fn(input_shape)) + expected_outputs = model.matmul(input_data) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + + new_outputs = root.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + + # Indirectly tests that the model is only partially quantized. + self.assertAllClose(new_outputs, expected_outputs, rtol=0.011) + + @test_util.run_in_graph_and_eager_modes + def test_ptq_quantization_method_not_applied_when_matcher_mismatch(self): + """Tests that quantization method is not applied to unmatched units.""" + input_shape = (1, 2) + model = self._create_matmul_model( + input_shape, + weight_shape=(2, 3), + saved_model_path=self._input_saved_model_path, + ) + + rng = np.random.default_rng(1230) + random_tensor_gen_fn = lambda: rng.uniform( + low=0.0, high=1.0, size=input_shape + ).astype(np.float32) + + def data_gen() -> repr_dataset.RepresentativeDataset: + for _ in range(50): + yield {'input_tensor': random_tensor_gen_fn()} + + dataset_path = self.create_tempfile('tfrecord').full_path + path_map = {'serving_default': dataset_path} + repr_dataset.TfRecordRepresentativeDatasetSaver(path_map).save( + {'serving_default': data_gen()} + ) + + config = qc.QuantizationConfig( + static_range_ptq_preset=qc.StaticRangePtqPreset( + representative_datasets=[ + qc.RepresentativeDatasetConfig( + tf_record=qc.TfRecordFile(path=dataset_path) + ) + ] + ), + tf_saved_model=qc.TfSavedModelConfig(tags=[tag_constants.SERVING]), + specs=qc.QuantizationSpecs( + specs=[ + 
qc.QuantizationSpec( + # Provide a regex that wouldn't match any quantizable units. + matcher=qc.MatcherSpec( + function_name=qc.FunctionNameMatcherSpec( + regex='.*invalid_function_name.*' + ), + ), + method=qc.Method(no_quantization={}), + ), + ], + ), + ) + quantization.quantize_saved_model( + self._input_saved_model_path, + self._output_saved_model_path, + config, + ) + + input_data = ops.convert_to_tensor(random_tensor_gen_fn()) + expected_outputs = model.matmul(input_data) + + root = load.load(self._output_saved_model_path) + self.assertCountEqual(root.signatures.keys(), {'serving_default'}) + + new_outputs = root.signatures['serving_default']( + input_tensor=ops.convert_to_tensor(input_data) + ) + + # Tests that the quantized graph outputs similar values. They also shouldn't + # be exactly the same. Indirectly proves that the `FunctionNameMatcherSpec` + # with regex '.*invalid_function_name.*' did not match the quantizable unit. + self.assertAllClose(new_outputs, expected_outputs, rtol=0.04) + self.assertNotAllClose(new_outputs, expected_outputs, rtol=0.00001) + if __name__ == '__main__': test.main() diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py index a33ccb1b824e3f..9a5b9225c78a82 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/integration_test/quantize_model_test_base.py @@ -13,7 +13,8 @@ # limitations under the License. 
# ============================================================================== """Base test class for quantize_model Tests.""" -from typing import Mapping, Sequence, Optional, Tuple, List + +from typing import List, Mapping, Optional, Sequence, Tuple from absl.testing import parameterized import numpy as np @@ -53,10 +54,8 @@ def _create_matmul_model( input_shape: Sequence[int], weight_shape: Sequence[int], saved_model_path: str, - has_bias: bool = False, + bias_fn: Optional[ops.Operation] = None, activation_fn: Optional[ops.Operation] = None, - bias_size: Optional[int] = None, - use_biasadd: bool = True, ) -> module.Module: class MatmulModel(module.Module): """A simple model with a single matmul. @@ -67,40 +66,28 @@ class MatmulModel(module.Module): def __init__( self, weight_shape: Sequence[int], - bias_size: Optional[int] = None, - activation_fn: Optional[ops.Operation] = None, - use_biasadd: bool = True, ) -> None: """Initializes a MatmulModel. Args: weight_shape: Shape of the weight tensor. - bias_size: If None, do not use bias. Else, use given size as bias. - activation_fn: The activation function to be used. No activation - function if None. - use_biasadd: If True, use BiasAdd for adding bias, else use AddV2. """ - self.bias_size = bias_size - self.activation_fn = activation_fn - self.use_biasadd = use_biasadd self.filters = np.random.uniform(low=-1.0, high=1.0, size=weight_shape) - if bias_size is not None: - self.bias = np.random.uniform(low=-1.0, high=1.0, size=bias_size) - - def has_bias(self) -> bool: - return self.bias_size is not None + if bias_fn is not None: + self.bias = np.random.uniform( + low=-1.0, high=1.0, size=weight_shape[-1] + ) def has_reshape(self) -> bool: - return self.has_bias() and self.bias_size != self.filters.shape[-1] + return self.bias_fn() and self.bias_size != self.filters.shape[-1] @def_function.function def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: """Performs a matrix multiplication. 
- Depending on self.has_bias and self.activation_fn, it may add a bias - term or - go through the activaction function. + Depending on self.bias_fn and self.activation_fn, it may add a bias + term or go through the activaction function. Args: input_tensor: Input tensor to matmul with the filter. @@ -109,18 +96,13 @@ def matmul(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: A map of: output key -> output result. """ out = math_ops.matmul(input_tensor, self.filters, name='sample/matmul') - + if bias_fn is not None: + out = bias_fn(out, self.bias) + if activation_fn is not None: + out = activation_fn(out) return {'output': out} - # If bias_size is not explictly given, it should default to width of weight. - if bias_size is None and has_bias: - bias_size = weight_shape[-1] - - # Verify that when bias_size is not None, has_bias should be True. - # And if bias_size is None, has_bias should be False. - assert (bias_size is None) != has_bias - - model = MatmulModel(weight_shape, bias_size, activation_fn) + model = MatmulModel(weight_shape) saved_model_save.save( model, saved_model_path, @@ -227,9 +209,9 @@ def _create_conv2d_model( input_shape: Sequence[int], filter_shape: Sequence[int], saved_model_path: str, - has_bias: bool = False, - has_batch_norm: bool = False, + bias_fn: Optional[ops.Operation] = None, activation_fn: Optional[ops.Operation] = None, + has_batch_norm: bool = False, strides: Sequence[int] = (1, 1, 1, 1), dilations: Sequence[int] = (1, 1, 1, 1), padding: str = 'SAME', @@ -277,6 +259,10 @@ def conv2d(self, input_tensor: core.Tensor) -> Mapping[str, core.Tensor]: data_format='NHWC', name='sample/conv', ) + if bias_fn is not None: + out = nn_ops.bias_add(out, self.bias) + if activation_fn is not None: + out = activation_fn(out) if has_batch_norm: # Fusing is supported for non-training case. 
out, _, _, _, _, _ = nn_ops.fused_batch_norm_v3( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc index 6ef921c4260841..bfecd82e21e56f 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.cc @@ -20,6 +20,7 @@ limitations under the License. #include "pybind11_abseil/absl_casters.h" // from @pybind11_abseil // IWYU pragma: keep #include "pybind11_abseil/import_status_module.h" // from @pybind11_abseil #include "pybind11_abseil/status_casters.h" // from @pybind11_abseil // IWYU pragma: keep +#include "tensorflow/compiler/mlir/quantization/stablehlo/cc/config.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/static_range_ptq.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/python/type_casters.h" // IWYU pragma: keep @@ -28,8 +29,9 @@ namespace py = pybind11; namespace { using ::mlir::quant::stablehlo::QuantizeStaticRangePtq; +using ::stablehlo::quantization::PopulateDefaults; -} +} // namespace PYBIND11_MODULE(pywrap_quantization, m) { // Supports absl::Status type conversions. @@ -55,17 +57,24 @@ PYBIND11_MODULE(pywrap_quantization, m) { as defined by the `MetaGraphDef::MetaInfoDef::function_aliases` from the input SavedModel. - `representative_dataset_file_map_serialized` is a signature key -> - `RepresentativeDatasetFile` (serialized) mapping for running the - calibration step. Each dataset file stores the representative dataset - for the function matching the signature key. - Raises `StatusNotOk` exception if when the run was unsuccessful. 
)pbdoc", py::arg("src_saved_model_path"), py::arg("dst_saved_model_path"), py::arg("quantization_config_serialized"), py::kw_only(), py::arg("signature_keys"), py::arg("signature_def_map_serialized"), - py::arg("function_aliases"), py::arg("py_function_library"), - py::arg("representative_dataset_file_map_serialized")); + py::arg("function_aliases"), py::arg("py_function_library")); + // LINT.ThenChange(pywrap_quantization.pyi:static_range_ptq) + + // If the function signature changes, likely its corresponding .pyi type + // hinting should also change. + // LINT.IfChange(populate_default_configs) + m.def("populate_default_configs", &PopulateDefaults, + R"pbdoc( + Populates `QuantizationConfig` with default values. + + Returns an updated `QuantizationConfig` (serialized) after populating + default values to fields that the user did not explicitly specify. + )pbdoc", + py::arg("user_provided_config_serialized")); // LINT.ThenChange(pywrap_quantization.pyi:static_range_ptq) } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi index aee4463b238a6b..b3d016465004e6 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/pywrap_quantization.pyi @@ -27,8 +27,13 @@ def static_range_ptq( signature_def_map_serialized: dict[str, bytes], function_aliases: dict[str, str], py_function_library: py_function_lib.PyFunctionLibrary, - # Value type: RepresentativeDatasetFile. - representative_dataset_file_map_serialized: dict[str, bytes], ) -> Any: ... # Status # LINT.ThenChange() + +# LINT.IfChange(populate_default_configs) +def populate_default_configs( + user_provided_quantization_config_serialized: bytes, +) -> bytes: ... 
# QuantizationConfig + +# LINT.ThenChange() diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py index 26e38f242af6d9..5e1ce4e7d65ba8 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py +++ b/tensorflow/compiler/mlir/quantization/stablehlo/python/quantization.py @@ -17,7 +17,6 @@ from tensorflow.compiler.mlir.quantization.stablehlo import quantization_config_pb2 as qc from tensorflow.compiler.mlir.quantization.stablehlo.python import pywrap_quantization -from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 as quant_opts_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.python import py_function_lib from tensorflow.compiler.mlir.quantization.tensorflow.python import save_model from tensorflow.core.protobuf import meta_graph_pb2 @@ -45,6 +44,25 @@ def _serialize_signature_def_map( return signature_def_map_serialized +def _populate_default_quantization_config( + config: qc.QuantizationConfig, +) -> qc.QuantizationConfig: + """Populates `QuantizationConfig` with default values. + + Args: + config: User-provided quantization config. + + Returns: + Updated `QuantizationConfig` after populating default values to fields that + the user did not explicitly specify. + """ + pipeline_config = config.pipeline_config + if not pipeline_config.HasField('unpack_quantized_types'): + pipeline_config.unpack_quantized_types = True + + return config + + # TODO: b/310594193 - Export API to pip package. def quantize_saved_model( src_saved_model_path: str, @@ -71,6 +89,10 @@ def quantize_saved_model( ' single signature.' 
) + config = qc.QuantizationConfig.FromString( + pywrap_quantization.populate_default_configs(config.SerializeToString()) + ) + signature_def_map = save_model.get_signatures_from_saved_model( src_saved_model_path, signature_keys=None, @@ -82,18 +104,6 @@ def quantize_saved_model( config.tf_saved_model.tags ).meta_info_def.function_aliases - # Create a signature key -> `RepresentativeDatasetFile` mapping. - # `RepresentativeDatsetFile` should be serialized for `static_range_ptq` due - # to pywrap protobuf compatibility requirements. - tfrecord_file_path: str = ( - config.static_range_ptq_preset.representative_datasets[0].tf_record.path - ) - dataset_file_map = { - 'serving_default': quant_opts_pb2.RepresentativeDatasetFile( - tfrecord_file_path=tfrecord_file_path - ).SerializeToString() - } - signature_def_map_serialized = _serialize_signature_def_map(signature_def_map) pywrap_quantization.static_range_ptq( src_saved_model_path, @@ -103,5 +113,4 @@ def quantize_saved_model( signature_def_map_serialized=signature_def_map_serialized, function_aliases=dict(function_aliases), py_function_library=py_function_lib.PyFunctionLibrary(), - representative_dataset_file_map_serialized=dataset_file_map, ) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto index 89f396061de0b9..25623ad4497655 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto +++ b/tensorflow/compiler/mlir/quantization/stablehlo/quantization_config.proto @@ -31,11 +31,18 @@ message RepresentativeDatasetConfig { // Minimal user input about representative datasets is required. Representative // datasets are required for static-range PTQ to retrieve quantization // statistics via calibration. -// Next ID: 2 +// Next ID: 3 message StaticRangePtqPreset { // Configures representative dataset. Each item corresponds to a // representative dataset used to calibrate a function. 
repeated RepresentativeDatasetConfig representative_datasets = 1; + + // NOTE: This field will be deprecated. + // Granularity should be controlled in custom configuration, deprecating + // this field once available. + // If set true, enable channel-wise quantization for all supported ops. + // This value is true by default. + bool enable_per_channel_quantized_weight = 2; } // Metadata specific to the input TensorFlow SavedModel, which may be required @@ -47,10 +54,77 @@ message TfSavedModelConfig { repeated string tags = 1; } +// Configures the graph transformation pipeline for quantization. +message PipelineConfig { + // When set to True, unpacks ops with uniform quantized types into operations + // without uniform quantized types (mostly i8 or i32). Useful when the target + // hardware performs better with integer ops. + // Default value: true + optional bool unpack_quantized_types = 1; +} + +// A quantization method representing "do not quantize". Mostly used for +// denylisting quantizable units from quantization. +message NoQuantization {} + +// Represents a matching method that matches quantizable units by lifted +// functions' names. +message FunctionNameMatcherSpec { + // Regular expression to match lifted functions' names. Underlying regex + // engine uses re2, which accepts a subset of PCRE. See + // https://github.com/google/re2/wiki/Syntax for details. + string regex = 1; +} + +// Matcher specification for identifying quantizable units. +message MatcherSpec { + // Matches lifted functions by their names. + FunctionNameMatcherSpec function_name = 1; +} + +// Specifies how to quantize matched quantizable units. +message Method { + NoQuantization no_quantization = 1; +} + +// A QuantizationSpec is essentially a (matcher spec, quantization method) pair, +// where the matcher spec is used to identify quantizable units and the +// quantization method specifies what type of quantization to apply on the +// matched quantizable units. 
+// Next ID: 3 +message QuantizationSpec { + // Configures matchers for identifying quantizable units. Matched quantizable + // units will be quantized according to `method`. + MatcherSpec matcher = 1; + + // Specifies how to quantize the matched quantizable units. + Method method = 2; +} + +// Quantization specifications. A simple wrapper around a sequence of +// `QuantizationSpec`s so that specs can be easily passed around or represented +// as a textproto. +// Next ID: 2 +message QuantizationSpecs { + // List of `QuantizationSpec`s. Later spec in the sequence takes precedence. + // + // NOTE: Tie-breaking mechanism is not yet supported. Providing multiple + // `QuantizationSpec` with conflicting quantizable units may result in + // undefined behavior. + // TODO: b/307620778 - Support tie-breaking for conflicting specs. + repeated QuantizationSpec specs = 1; +} + // Quantization configuration for StableHLO Quantizer. This is the primary // message containing all configurable options. -// Next ID: 4 +// Next ID: 5 message QuantizationConfig { + // Config presets provide predefined popular or common quantization specs. + // Lightweight users may choose one of the presets for quick experiments. Each + // preset is completely represented by `QuantizationSpecs`. When extra entries + // in `QuantizationSpecs` are provided along with a preset, then the preset + // will be overridden for the quantizable units matched by those additional + // `QuantizationSpec`s. oneof preset { // Performs best-effort static-range post-training quantization (PTQ). StaticRangePtqPreset static_range_ptq_preset = 1; @@ -58,4 +132,9 @@ message QuantizationConfig { // TF SavedModel specific information for the input model. TfSavedModelConfig tf_saved_model = 2; + + // Configures the graph transformation pipeline for quantization. 
+ PipelineConfig pipeline_config = 3; + + QuantizationSpecs specs = 4; } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD index 6fc15864fb0f8b..db4bc1a92483c1 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/BUILD @@ -46,24 +46,3 @@ tf_cc_test( "@local_tsl//tsl/platform:protobuf", ], ) - -tf_cc_test( - name = "stablehlo_op_quant_spec_test", - srcs = ["stablehlo_op_quant_spec_test.cc"], - deps = [ - "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", - "//tensorflow/compiler/mlir/quantization/common:test_base", - "//tensorflow/compiler/mlir/quantization/stablehlo/ops:stablehlo_op_quant_spec", - "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", - "//tensorflow/core:test", - "@com_google_absl//absl/strings:string_view", - "@com_google_googletest//:gtest_main", - "@llvm-project//mlir:ArithDialect", - "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", - "@llvm-project//mlir:QuantOps", - "@stablehlo//:stablehlo_ops", - ], -) diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir index 266b9735224e79..cba7d378fcc190 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert-mhlo-quant-to-int.mlir @@ -117,10 +117,10 @@ func.func @quantize_per_channel(%arg0: tensor<26x26x3x2xf32> // CHECK-DAG: %[[QMIN:.*]] = mhlo.constant dense<-2.14748365E+9> : tensor // CHECK-DAG: %[[QMAX:.*]] = mhlo.constant dense<2.14748365E+9> : tensor // CHECK: %[[DIVIDE:.*]] = chlo.broadcast_divide %arg0, %[[SCALES]] - // CHECK-SAME: {broadcast_dimensions = dense<3> : 
tensor<1xi64>} + // CHECK-SAME: {broadcast_dimensions = array} // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> // CHECK: %[[ADD:.*]] = chlo.broadcast_add %[[DIVIDE]], %[[ZPS]] - // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: {broadcast_dimensions = array} // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> // CHECK: %[[CLAMP:.*]] = mhlo.clamp %[[QMIN]], %[[ADD]], %[[QMAX]] // CHECK: %[[ROUND:.*]] = mhlo.round_nearest_even %[[CLAMP]] @@ -141,12 +141,12 @@ func.func @dequantize_per_channel( // CHECK-DAG: %[[ZPS:.*]] = mhlo.constant dense<[-10, 2]> : tensor<2xi32> // CHECK: %[[SUBTRACT:.*]] = chlo.broadcast_subtract // CHECK-SAME: %[[INPUT:.*]], %[[ZPS]] - // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: {broadcast_dimensions = array} // CHECK-SAME: (tensor<26x26x3x2xi32>, tensor<2xi32>) -> tensor<26x26x3x2xi32> // CHECK: %[[FLOAT:.*]] = mhlo.convert %[[SUBTRACT]] // CHECK: %[[RESULT:.*]] = chlo.broadcast_multiply // CHECK-SAME: %[[FLOAT]], %[[SCALES]] - // CHECK-SAME: {broadcast_dimensions = dense<3> : tensor<1xi64>} + // CHECK-SAME: {broadcast_dimensions = array} // CHECK-SAME: (tensor<26x26x3x2xf32>, tensor<2xf32>) -> tensor<26x26x3x2xf32> %0 = mhlo.uniform_dequantize %arg0 : ( tensor<26x26x3x2x!quant.uniform> @@ -304,6 +304,78 @@ func.func @add_different_res_type( // ----- +// CHECK-LABEL: func @add_per_channel +func.func @add_per_channel( + %arg0: tensor>, + %arg1: tensor> + ) -> tensor> { + // CHECK: %[[ADD:.*]] = mhlo.add {{.*}} : tensor + // CHECK: %[[ZPS:.*]] = mhlo.constant dense<[3, 2]> : tensor<2xi32> + // CHECK: %[[BCAST_SUB:.*]] = chlo.broadcast_subtract %[[ADD]], %[[ZPS]] + // CHECK-SAME: {broadcast_dimensions = array} + // CHECK-SAME: (tensor, tensor<2xi32>) -> tensor + // CHECK: return %[[BCAST_SUB]] : tensor + %11 = mhlo.add %arg0, %arg1 : tensor> + return %11 : tensor> +} + +// ----- + +// CHECK-LABEL: func 
@add_per_channel_no_zp +func.func @add_per_channel_no_zp( + %arg0: tensor>, + %arg1: tensor> + ) -> tensor> { + // CHECK: %[[ADD:.*]] = mhlo.add {{.*}} : tensor + // CHECK: return %[[ADD]] : tensor + %11 = mhlo.add %arg0, %arg1 : tensor> + return %11 : tensor> +} + +// ----- + +func.func @add_per_channel_i8( + %arg0: tensor>, + %arg1: tensor> + ) -> tensor> { + // expected-error@+2 {{Per-channel quantized AddOp requires i32 storage type}} + // expected-error@+1 {{failed to legalize operation 'mhlo.add' that was explicitly marked illegal}} + %11 = mhlo.add %arg0, %arg1 : tensor> + return %11 : tensor> +} + +// ----- + +func.func @add_per_channel_different_quant_types( + %arg0: tensor>, + %arg1: tensor> + ) -> tensor> { + // expected-error@+2 {{Per-channel quantized AddOp requires the same quantized element type for all operands and results}} + // expected-error@+1 {{failed to legalize operation 'mhlo.add' that was explicitly marked illegal}} + %11 = mhlo.add %arg0, %arg1 : ( + tensor>, + tensor> + ) -> tensor> + return %11 : tensor> +} + +// ----- + +func.func @add_per_channel_per_tensor_mix( + %arg0: tensor>, + %arg1: tensor> + ) -> tensor> { + // expected-error@+2 {{Per-channel quantized AddOp requires the same quantized element type for all operands and results}} + // expected-error@+1 {{failed to legalize operation 'mhlo.add' that was explicitly marked illegal}} + %11 = mhlo.add %arg0, %arg1 : ( + tensor>, + tensor> + ) -> tensor> + return %11 : tensor> +} + +// ----- + // CHECK-LABEL: func @requantize func.func @requantize( %arg0: tensor> @@ -351,10 +423,10 @@ func.func @requantize_per_channel( // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[2.000000e+00, 5.000000e-01]> : tensor<2xf32> // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = 
array // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant dense<[-5.000000e+00, -2.000000e+00]> : tensor<2xf32> // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = array // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] @@ -375,10 +447,10 @@ func.func @requantize_per_channel_to_per_tensor( // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[2.000000e+00, 1.000000e+00]> : tensor<2xf32> // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = array // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant dense<[-5.000000e+00, -1.000000e+00]> : tensor<2xf32> // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = array // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] @@ -399,10 +471,10 @@ func.func @requantize_per_tensor_to_per_channel( // CHECK-DAG: %[[VAL1:.*]] = mhlo.convert %arg0 : (tensor<2x2xi8>) -> tensor<2x2xf32> // CHECK-DAG: %[[MERGED_SCALE:.*]] = mhlo.constant dense<[1.000000e+00, 5.000000e-01]> : tensor<2xf32> // CHECK: %[[VAL2:.*]] = chlo.broadcast_multiply %[[VAL1]], %[[MERGED_SCALE]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = array // CHECK-DAG: %[[MERGED_ZP:.*]] = mhlo.constant 
dense<[-1.000000e+00, -2.000000e+00]> : tensor<2xf32> // CHECK: %[[VAL3:.*]] = chlo.broadcast_add %[[VAL2]], %[[MERGED_ZP]] - // CHECK-SAME: broadcast_dimensions = dense<1> : tensor<1xi64> + // CHECK-SAME: broadcast_dimensions = array // CHECK-DAG: %[[QUANT_MIN:.*]] = mhlo.constant dense<-1.280000e+02> : tensor // CHECK-DAG: %[[QUANT_MAX:.*]] = mhlo.constant dense<1.270000e+02> : tensor // CHECK: %[[VAL4:.*]] = mhlo.clamp %[[QUANT_MIN]], %[[VAL3]], %[[QUANT_MAX]] diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir index 7a568425415170..69f3f50b64ded6 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/bridge/convert_tf_quant_ops_to_mhlo.mlir @@ -40,7 +40,7 @@ func.func @uniform_quantized_add(%input: tensor<3x2xf32>) -> tensor<3x2xf32> { quantization_axis = -1 : i64, quantization_min_val = -2147483648 : i64, quantization_max_val = 2147483647 : i64 } : (tensor<3x2xf32>, tensor, tensor) -> tensor<3x2x!tf_type.qint32> - // CHECK: chlo.broadcast_add %[[LHS2]], %[[RHS]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK: chlo.broadcast_add %[[LHS2]], %[[RHS]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> %1 = "tf.UniformQuantizedAdd"( @@ -85,7 +85,7 @@ func.func @uniform_quantized_add_bias_not_const(%input1: tensor<3x2xi32>, %input %input1_qint = "tf.Cast"(%input1) {Truncate = false} : (tensor<3x2xi32>) -> tensor<3x2x!tf_type.qint32> %input2_qint = "tf.Cast"(%input2) {Truncate = false} : (tensor<2xi32>) -> tensor<2x!tf_type.qint32> - // CHECK: %[[RES:.*]] = chlo.broadcast_add %[[LHS_2]], %[[RHS_2]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK: %[[RES:.*]] = chlo.broadcast_add 
%[[LHS_2]], %[[RHS_2]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> %result = "tf.UniformQuantizedAdd"( diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir similarity index 61% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/post_calibration_component.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir index e97e5b1d11cfa3..aa11f88937e913 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_calibration_component.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/post_calibration_component.mlir @@ -1,4 +1,7 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-test-post-calibration-component | FileCheck %s +// RUN: stablehlo-quant-opt %s -stablehlo-test-post-calibration-component \ +// RUN: -split-input-file | FileCheck %s +// RUN: stablehlo-quant-opt %s -stablehlo-test-post-calibration-component='unpack-quantized-types=false' \ +// RUN: -split-input-file | FileCheck %s --check-prefix=CHECK-NO-UNPACK // Tests that a simple dot_general (lifted as a function) with CustomAggregators // around it is quantized. The resulting graph has quantized types unpacked into @@ -31,6 +34,33 @@ func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: // ----- +// Tests that a simple dot_general (lifted as a function) with CustomAggregators +// around it is quantized, when the 'unpack-quantized-types' option is set to +// false. This test case inputs the same graph as the test above. Note that now +// the uniform quantized types are directly expressed within the graph. 
+ +func.func @main_no_unpack(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { + %0 = "tf.Const"() <{value = dense<0.5> : tensor<1024x3xf32>}> : () -> tensor<1024x3xf32> + %1 = "tf.CustomAggregator"(%arg0) <{id = "1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 0.999992311 : f32, max_percentile = 0.000000e+00 : f32, min = 7.547870e-07 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x1024xf32>) -> tensor<1x1024xf32> + %2 = "tf.XlaCallModule"(%1, %0) <{Sout = [#tf_type.shape<1x3>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + %3 = "tf.CustomAggregator"(%2) <{id = "2"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 18.3033524 : f32, max_percentile = 0.000000e+00 : f32, min = -17.5216827 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %3 : tensor<1x3xf32> +} +func.func private @composite_dot_general_fn_1(%arg0: tensor<1x1024xf32>, %arg1: tensor<1024x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x1024xf32>, tensor<1024x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> +} +// CHECK-NO-UNPACK-LABEL: func.func @main_no_unpack +// CHECK-NO-UNPACK-SAME: (%[[ARG_0:.+]]: tensor<1x1024xf32>) -> tensor<1x3xf32> +// CHECK-NO-UNPACK-DAG: %[[CONST:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1024x3xi8>} : () -> tensor<1024x3x!quant.uniform:f32, {{.*}}>> +// CHECK-NO-UNPACK: %[[QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x1024xf32>) -> tensor<1x1024x!quant.uniform> +// CHECK-NO-UNPACK: 
%[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[QUANTIZE_0]], %[[CONST]], contracting_dims = [1] x [0] : (tensor<1x1024x!quant.uniform>, tensor<1024x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK-NO-UNPACK: %[[QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-NO-UNPACK: %[[DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[QUANTIZE_1]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> +// CHECK-NO-UNPACK: return %[[DEQUANTIZE]] : tensor<1x3xf32> + +// ----- + // Tests that a simple dot_general without CustomAggregators is not quantized. func.func @main(%arg0: tensor<1x1024xf32>) -> tensor<1x3xf32> { diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/pre_calibration_component.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/pre_calibration_component.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/components/pre_calibration_component.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir new file mode 100644 index 00000000000000..09afb528f602aa --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/components/tf_to_stablehlo.mlir @@ -0,0 +1,59 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics -stablehlo-test-tf-to-stablehlo | FileCheck %s + +func.func @fused_batchnorm_no_training(%arg0: tensor<8x8x8x8xf32>) -> (tensor<8x8x8x8xf32>) { + %cst_0 = "tf.Const"() {value = dense<[0.1, 0.2, 0.1, 0.2, 0.1, 0.2, 0.1, 0.2]> : tensor<8xf32>} : () -> tensor<8xf32> + %cst_1 = "tf.Const"() {value = dense<[0.3, 0.4, 0.3, 0.4, 0.3, 0.4, 0.3, 0.4]> : tensor<8xf32>} : () -> tensor<8xf32> + %0:6 = "tf.FusedBatchNormV3"(%arg0, 
%cst_0, %cst_1, %cst_0, %cst_1) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) -> (tensor<8x8x8x8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>, tensor<8xf32>) + func.return %0#0 : tensor<8x8x8x8xf32> +} +// CHECK: func.func @main(%[[ARG_0:.+]]: tensor<8x8x8x8xf32>) -> tensor<8x8x8x8xf32> +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<{{.*}}> : tensor<8xf32> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<{{.*}}> : tensor<8xf32> +// CHECK: %[[BROADCAST_0:.*]] = stablehlo.broadcast_in_dim %[[CONST_0]], dims = [3] : (tensor<8xf32>) -> tensor<8x8x8x8xf32> +// CHECK: %[[BROADCAST_1:.*]] = stablehlo.broadcast_in_dim %[[CONST_1]], dims = [3] : (tensor<8xf32>) -> tensor<8x8x8x8xf32> +// CHECK: %[[MUL:.*]] = stablehlo.multiply %arg0, %[[BROADCAST_0]] : tensor<8x8x8x8xf32> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[MUL]], %[[BROADCAST_1]] : tensor<8x8x8x8xf32> +// CHECK: return %[[ADD]] : tensor<8x8x8x8xf32> + +// ----- + +func.func @fuse_conv_batchnorm(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_1 = "tf.Const"() {value = dense<[0.1, 0.2]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_2 = "tf.Const"() {value = dense<[0.3, 0.4]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst_0) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1:6 = "tf.FusedBatchNormV3"(%0, %cst_1, %cst_2, %cst_1, %cst_2) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<1x3x2x2xf32>, tensor<2xf32>, 
tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + func.return %1#0 : tensor<1x3x2x2xf32> +} +// CHECK: func.func @main(%[[ARG:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32> { +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<[{{.*}}]> : tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<[{{.*}}]> : tensor<2xf32> +// CHECK: %[[BROADCAST_0:.*]] = stablehlo.broadcast_in_dim %[[CONST_0]], dims = [3] : (tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[BROADCAST_1:.*]] = stablehlo.broadcast_in_dim %[[CONST_1]], dims = [3] : (tensor<2xf32>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%[[ARG]], %[[BROADCAST_1]]) {{.*}} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_0]] : tensor<1x3x2x2xf32> +// CHECK: return %[[ADD]] : tensor<1x3x2x2xf32> + +// ----- + +func.func @func_conv_batchnorm_relu6(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst_0 = "tf.Const"() {value = dense<1.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_1 = "tf.Const"() {value = dense<[0.1, 0.2]> : tensor<2xf32>} : () -> tensor<2xf32> + %cst_2 = "tf.Const"() {value = dense<[0.3, 0.4]> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "tf.Conv2D"(%arg0, %cst_0) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1:6 = "tf.FusedBatchNormV3"(%0, %cst_1, %cst_2, %cst_1, %cst_2) {T = "tfdtype$DT_FLOAT", data_format = "NHWC", epsilon = 0.001 : f32, is_training = false} : (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) -> (tensor<1x3x2x2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>, tensor<2xf32>) + %2 = "tf.Relu6"(%1#0) : (tensor<1x3x2x2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} +// CHECK: 
func.func @main(%[[ARG:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x2x2xf32> { +// CHECK-DAG: %[[CONST_0:.*]] = stablehlo.constant dense<[{{.*}}]> : tensor<2xf32> +// CHECK-DAG: %[[CONST_1:.*]] = stablehlo.constant dense<[{{.*}}]> : tensor<2xf32> +// CHECK-DAG: %[[CONST_2:.*]] = stablehlo.constant dense<6.000000e+00> : tensor +// CHECK-DAG: %[[CONST_3:.*]] = stablehlo.constant dense<0.000000e+00> : tensor +// CHECK: %[[BROADCAST_0:.*]] = stablehlo.broadcast_in_dim %[[CONST_0]], dims = [3] : (tensor<2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[BROADCAST_1:.*]] = stablehlo.broadcast_in_dim %[[CONST_1]], dims = [3] : (tensor<2xf32>) -> tensor<2x3x3x2xf32> +// CHECK: %[[CONV:.*]] = stablehlo.convolution(%[[ARG]], %[[BROADCAST_1]]) {{.*}} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_0]] : tensor<1x3x2x2xf32> +// CHECK: %[[RELU6:.*]] = stablehlo.clamp %[[CONST_3]], %[[ADD]], %[[CONST_2]] : (tensor, tensor<1x3x2x2xf32>, tensor) -> tensor<1x3x2x2xf32> +// CHECK: return %[[RELU6]] : tensor<1x3x2x2xf32> + diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/convert_func_to_bfloat16.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/convert_func_to_bfloat16.mlir deleted file mode 100644 index 6923209e531da9..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/convert_func_to_bfloat16.mlir +++ /dev/null @@ -1,55 +0,0 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-convert-func-to-bfloat16 -verify-diagnostics | FileCheck %s - -// CHECK-LABEL: @add_f32(%arg0: tensor<3x3xbf16>, %arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> -func.func @add_f32(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> { - // CHECK-NOT: f32 - // CHECK: stablehlo.add - %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> - return %0 : tensor<3x3xf32> -} - -// ----- - -// CHECK-LABEL: @add_f64(%arg0: tensor<3x3xbf16>, 
%arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> -func.func @add_f64(%arg0: tensor<3x3xf64>, %arg1: tensor<3x3xf64>) -> tensor<3x3xf64> { - // CHECK-NOT: f64 - // CHECK: stablehlo.add - %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf64>, tensor<3x3xf64>) -> tensor<3x3xf64> - return %0 : tensor<3x3xf64> -} - -// ----- - -// CHECK-LABEL: @constant_f32() -> tensor<2x2xbf16> -func.func @constant_f32() -> tensor<2x2xf32> { - // CHECK-NOT: f32 - // CHECK{LITERAL}: stablehlo.constant dense<[[1.398440e+00, 0.000000e+00], [3.093750e+00, -2.001950e-01]]> : tensor<2x2xbf16> - %0 = stablehlo.constant dense<[[1.4, 0.0], [3.1, -0.2]]> : tensor<2x2xf32> - return %0 : tensor<2x2xf32> -} - -// ----- - -func.func @constant_elided() -> tensor<2x2xf32> { - // expected-error @+1 {{failed to legalize operation 'stablehlo.constant' that was explicitly marked illegal}} - %0 = stablehlo.constant dense_resource<__elided__> : tensor<2x2xf32> - return %0 : tensor<2x2xf32> -} - -// ----- - -// CHECK-LABEL: @reduce_window_f32(%arg0: tensor<2x3x1x3xbf16>) -> tensor<2x3x1x3xbf16> -func.func @reduce_window_f32(%arg0: tensor<2x3x1x3xf32>) -> tensor<2x3x1x3xf32> { - // CHECK-NOT: f32 - // CHECK: stablehlo.reduce_window - %0 = stablehlo.constant dense<0.0> : tensor - %1 = "stablehlo.reduce_window"(%arg0, %0) ({ - ^bb0(%arg1: tensor, %arg2: tensor): - %2 = stablehlo.maximum %arg1, %arg2 : tensor - stablehlo.return %2 : tensor - }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>} : (tensor<2x3x1x3xf32>, tensor) -> tensor<2x3x1x3xf32> - return %1 : tensor<2x3x1x3xf32> -} - -// ----- - diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/convert_func_to_bfloat16.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/convert_func_to_bfloat16.mlir new file mode 100644 index 00000000000000..fdb5860eb1bd23 --- /dev/null +++ 
b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/convert_func_to_bfloat16.mlir @@ -0,0 +1,128 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-convert-func-to-bfloat16 -verify-diagnostics | FileCheck %s + +// CHECK-LABEL: @add_f32(%arg0: tensor<3x3xbf16>, %arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> +func.func @add_f32(%arg0: tensor<3x3xf32>, %arg1: tensor<3x3xf32>) -> tensor<3x3xf32> { + // CHECK-NOT: f32 + // CHECK: stablehlo.add + %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf32>, tensor<3x3xf32>) -> tensor<3x3xf32> + return %0 : tensor<3x3xf32> +} + +// ----- + +// CHECK-LABEL: @add_f64(%arg0: tensor<3x3xbf16>, %arg1: tensor<3x3xbf16>) -> tensor<3x3xbf16> +func.func @add_f64(%arg0: tensor<3x3xf64>, %arg1: tensor<3x3xf64>) -> tensor<3x3xf64> { + // CHECK-NOT: f64 + // CHECK: stablehlo.add + %0 = stablehlo.add %arg0, %arg1: (tensor<3x3xf64>, tensor<3x3xf64>) -> tensor<3x3xf64> + return %0 : tensor<3x3xf64> +} + +// ----- + +// CHECK-LABEL: @constant_f32() -> tensor<2x2xbf16> +func.func @constant_f32() -> tensor<2x2xf32> { + // CHECK-NOT: f32 + // CHECK{LITERAL}: stablehlo.constant dense<[[1.398440e+00, 0.000000e+00], [3.093750e+00, -2.001950e-01]]> : tensor<2x2xbf16> + %0 = stablehlo.constant dense<[[1.4, 0.0], [3.1, -0.2]]> : tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + +func.func @constant_elided() -> tensor<2x2xf32> { + // expected-error @+1 {{failed to legalize operation 'stablehlo.constant' that was explicitly marked illegal}} + %0 = stablehlo.constant dense_resource<__elided__> : tensor<2x2xf32> + return %0 : tensor<2x2xf32> +} + +// ----- + +// CHECK-LABEL: @reduce_window_f32(%arg0: tensor<2x3x1x3xbf16>) -> tensor<2x3x1x3xbf16> +func.func @reduce_window_f32(%arg0: tensor<2x3x1x3xf32>) -> tensor<2x3x1x3xf32> { + // CHECK-NOT: f32 + // CHECK: stablehlo.reduce_window + %0 = stablehlo.constant dense<0.0> : tensor + %1 = "stablehlo.reduce_window"(%arg0, %0) ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %2 = 
stablehlo.maximum %arg1, %arg2 : tensor + stablehlo.return %2 : tensor + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x3xf32>, tensor) -> tensor<2x3x1x3xf32> + return %1 : tensor<2x3x1x3xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_i32_f32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_i32_f32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xf32> { + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xf32> + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %[[BITCAST]] : (tensor<1x256128xf32>) -> tensor<1x256128xbf16> + // CHECK: return %[[CONVERT]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_i32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xi32> +func.func @bitcast_convert_f32_i32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xi32> { + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xf32> + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %[[CONVERT]] : (tensor<1x256128xf32>) -> tensor<1x256128xi32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xi32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> tensor<1x256128xi32> + return %20 : tensor<1x256128xi32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_ui32_f32(%arg0: tensor<1x256128xui32>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_ui32_f32(%arg0: tensor<1x256128xui32>) -> tensor<1x256128xf32> { + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xui32>) -> tensor<1x256128xf32> + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %[[BITCAST]] : (tensor<1x256128xf32>) -> tensor<1x256128xbf16> + // CHECK: return %[[CONVERT]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert 
%arg0 : (tensor<1x256128xui32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_ui32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xui32> +func.func @bitcast_convert_f32_ui32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xui32> { + // CHECK: %[[CONVERT:.*]] = stablehlo.convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xf32> + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %[[CONVERT]] : (tensor<1x256128xf32>) -> tensor<1x256128xui32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xui32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> tensor<1x256128xui32> + return %20 : tensor<1x256128xui32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_f32_f32(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_f32_f32(%arg0: tensor<1x256128xf32>) -> tensor<1x256128xf32> { + // Convert bitcast_convert to no-op for f32->f32. + // CHECK: return %arg0 : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xf32>) -> tensor<1x256128xf32> + return %20 : tensor<1x256128xf32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_i32_ui32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xui32> +func.func @bitcast_convert_i32_ui32(%arg0: tensor<1x256128xi32>) -> tensor<1x256128xui32> { + // Do not convert bitcast_convert for legal types. + // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xui32> + // CHECK: return %[[BITCAST]] : tensor<1x256128xui32> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xi32>) -> tensor<1x256128xui32> + return %20 : tensor<1x256128xui32> +} + +// ----- + +// CHECK-LABEL: @bitcast_convert_bf16_bf16(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> +func.func @bitcast_convert_bf16_bf16(%arg0: tensor<1x256128xbf16>) -> tensor<1x256128xbf16> { + // Do not convert bitcast_convert for legal types. 
+ // CHECK: %[[BITCAST:.*]] = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xbf16> + // CHECK: return %[[BITCAST]] : tensor<1x256128xbf16> + %20 = stablehlo.bitcast_convert %arg0 : (tensor<1x256128xbf16>) -> tensor<1x256128xbf16> + return %20 : tensor<1x256128xbf16> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/convert_xla_call_module_op_to_bfloat16.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/convert_xla_call_module_op_to_bfloat16.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/convert_xla_call_module_op_to_bfloat16.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/convert_xla_call_module_op_to_bfloat16.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir similarity index 85% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir index 052d618a2e147c..e743a19dc0a822 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/lift_quantizable_spots_as_functions.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions.mlir @@ -38,14 +38,38 @@ func.func @dot_general_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { // ----- +// CHECK-LABEL: @dot_general_with_bias_same_shape_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x2xf32> +func.func @dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<2x3xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3xf32> + %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [1] x [0], precision = [DEFAULT, 
DEFAULT] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %3 = stablehlo.add %2, %1 : tensor<1x3xf32> + func.return %3: tensor<1x3xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x3xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_fn_1 +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: return %[[ADD]] : tensor<1x3xf32> +// CHECK: } + +// ----- + // CHECK-LABEL: @conv_with_bias_fn( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> %2 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %3 = stablehlo.add %2, %1 : tensor<1x3x3x4xf32> - func.return %3: tensor<1x3x3x4xf32> + %3 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %4 = stablehlo.add %2, %3 : tensor<1x3x3x4xf32> + func.return %4: tensor<1x3x3x4xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> @@ -54,8 +78,9 @@ func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> // CHECK: } // CHECK-LABEL: private @composite_conv_with_bias_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: 
%[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) -// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] // CHECK: return %[[ADD]] : tensor<1x3x3x4xf32> // CHECK: } @@ -65,10 +90,11 @@ func.func @conv_with_bias_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> %2 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %3 = stablehlo.add %2, %1 : tensor<1x1x64xf32> - func.return %3: tensor<1x1x64xf32> + %3 = stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.add %2, %3 : tensor<1x1x64xf32> + func.return %4: tensor<1x1x64xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> @@ -77,8 +103,9 @@ func.func @dot_general_with_bias_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64 // CHECK: } // CHECK-LABEL: private @composite_dot_general_with_bias_fn_1 +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 -// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] // CHECK: return %[[ADD]] : tensor<1x1x64xf32> // CHECK: } @@ -361,16 +388,44 @@ func.func @dot_general_with_relu6_dynamic_fn(%arg0: tensor) -> tens // ----- +// CHECK-LABEL: @dot_general_with_bias_same_shape_and_relu_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func 
@dot_general_with_bias_same_shape_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %4 = stablehlo.add %3, %1 : tensor<1x1x64xf32> + %5 = stablehlo.maximum %4, %2 : tensor<1x1x64xf32> + func.return %5: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu_fn_1 +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] +// CHECK: return %[[MAX]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + // CHECK-LABEL: @conv_with_bias_and_relu_fn( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> %3 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, 
tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %4 = stablehlo.add %3, %1 : tensor<1x3x3x4xf32> - %5 = stablehlo.maximum %4, %2 : tensor<1x3x3x4xf32> - func.return %5: tensor<1x3x3x4xf32> + %4 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %5 = stablehlo.add %3, %4 : tensor<1x3x3x4xf32> + %6 = stablehlo.maximum %5, %2 : tensor<1x3x3x4xf32> + func.return %6: tensor<1x3x3x4xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> @@ -380,8 +435,9 @@ func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x // CHECK-LABEL: private @composite_conv_with_bias_and_relu_fn_1 // CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) -// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] // CHECK: return %[[MAX]] : tensor<1x3x3x4xf32> // CHECK: } @@ -392,12 +448,13 @@ func.func @conv_with_bias_and_relu_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> %3 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %4 = stablehlo.add %3, %1 : tensor<1x1x64xf32> - %5 = stablehlo.maximum %4, %2 : tensor<1x1x64xf32> - func.return %5: tensor<1x1x64xf32> + %4 = 
stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.add %3, %4 : tensor<1x1x64xf32> + %6 = stablehlo.maximum %5, %2 : tensor<1x1x64xf32> + func.return %6: tensor<1x1x64xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> @@ -407,8 +464,9 @@ func.func @dot_general_with_bias_and_relu_fn(%arg0: tensor<1x1x167xf32>) -> tens // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu_fn_1 // CHECK: %[[CONST:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 -// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ADD]], %[[CONST]] // CHECK: return %[[MAX]] : tensor<1x1x64xf32> // CHECK: } @@ -485,17 +543,47 @@ func.func @dot_general_with_bias_and_relu_dynamic_fn(%arg0: tensor) // ----- +// CHECK-LABEL: @dot_general_with_bias_same_shape_and_relu6_fn( +// CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> +func.func @dot_general_with_bias_same_shape_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> + %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> + %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> + %5 = stablehlo.add %4, %1 : tensor<1x1x64xf32> + %6 = stablehlo.clamp %2, %5, %3 : tensor<1x1x64xf32> + func.return %6: tensor<1x1x64xf32> +} +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[CONST_1:.*]] = 
stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST_0]], %[[CONST_1]]) +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_with_bias_same_shape_and_relu6_fn_1 +// CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> +// CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] +// CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> +// CHECK: } + +// ----- + // CHECK-LABEL: @conv_with_bias_and_relu6_fn( // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x3x3x4xf32> func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3x3x4xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<3x3x4x4xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x3x3x4xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<4xf32> %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x3x3x4xf32> %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x3x3x4xf32> %4 = stablehlo.convolution(%arg0, %0) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[1, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x3x4xf32>, tensor<3x3x4x4xf32>) -> tensor<1x3x3x4xf32> - %5 = stablehlo.add %4, %1 : tensor<1x3x3x4xf32> - %6 = stablehlo.clamp %2, %5, %3 : tensor<1x3x3x4xf32> - func.return %6: tensor<1x3x3x4xf32> + %5 = stablehlo.broadcast_in_dim %1, dims = [3] : (tensor<4xf32>) -> tensor<1x3x3x4xf32> + %6 = stablehlo.add %4, %5 : tensor<1x3x3x4xf32> + %7 = stablehlo.clamp %2, %6, %3 : tensor<1x3x3x4xf32> + func.return %7: tensor<1x3x3x4xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant 
dense<2.000000e+00> @@ -506,8 +594,9 @@ func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3 // CHECK-LABEL: private @composite_conv_with_bias_and_relu6_fn_1 // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[CONV:.*]] = stablehlo.convolution(%arg0, %arg1) -// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[CONV]], %[[BROADCAST_IN_DIM]] // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x3x3x4xf32> // CHECK: } @@ -518,13 +607,14 @@ func.func @conv_with_bias_and_relu6_fn(%arg0: tensor<1x3x3x4xf32>) -> tensor<1x3 // CHECK-SAME: %[[ARG_0:.*]]: tensor<1x1x167xf32> func.func @dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> - %1 = stablehlo.constant dense<2.000000e+00> : tensor<1x1x64xf32> + %1 = stablehlo.constant dense<2.000000e+00> : tensor<64xf32> %2 = stablehlo.constant dense<0.000000e+00> : tensor<1x1x64xf32> %3 = stablehlo.constant dense<6.000000e+00> : tensor<1x1x64xf32> %4 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> tensor<1x1x64xf32> - %5 = stablehlo.add %4, %1 : tensor<1x1x64xf32> - %6 = stablehlo.clamp %2, %5, %3 : tensor<1x1x64xf32> - func.return %6: tensor<1x1x64xf32> + %5 = stablehlo.broadcast_in_dim %1, dims = [2] : (tensor<64xf32>) -> tensor<1x1x64xf32> + %6 = stablehlo.add %4, %5 : tensor<1x1x64xf32> + %7 = stablehlo.clamp %2, %6, %3 : tensor<1x1x64xf32> + func.return %7: tensor<1x1x64xf32> } // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<2.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<2.000000e+00> @@ -535,8 +625,9 @@ func.func 
@dot_general_with_bias_and_relu6_fn(%arg0: tensor<1x1x167xf32>) -> ten // CHECK-LABEL: private @composite_dot_general_with_bias_and_relu6_fn_1 // CHECK: %[[CONST_0:.*]] = stablehlo.constant dense<0.000000e+00> // CHECK: %[[CONST_1:.*]] = stablehlo.constant dense<6.000000e+00> +// CHECK: %[[BROADCAST_IN_DIM:.*]] = stablehlo.broadcast_in_dim %arg2 // CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 -// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %arg2 +// CHECK: %[[ADD:.*]] = stablehlo.add %[[DOT_GENERAL]], %[[BROADCAST_IN_DIM]] // CHECK: %[[CLAMP:.*]] = stablehlo.clamp %[[CONST_0]], %[[ADD]], %[[CONST_1]] // CHECK: return %[[CLAMP]] : tensor<1x1x64xf32> // CHECK: } @@ -619,7 +710,7 @@ func.func @gather_fn() -> tensor<2x3x2x2xi32> { collapsed_slice_dims = [0], start_index_map = [1, 0], index_vector_dim = 2>, - slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>, + slice_sizes = array, indices_are_sorted = false } : (tensor<3x4x2xi32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xi32> func.return %2: tensor<2x3x2x2xi32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir new file mode 100644 index 00000000000000..00b3dd3b5e57a4 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/lift_quantizable_spots_as_functions_with_quantization_specs.mlir @@ -0,0 +1,25 @@ +// RUN: stablehlo-quant-opt %s -stablehlo-test-lift-quantizable-spots-as-functions-with-quantization-specs \ +// RUN: -split-input-file | FileCheck %s + +// CHECK: @main +func.func @main(%arg0: tensor<1x1x167xf32>) -> tensor<1x1x64xf32> { + %0 = stablehlo.constant dense<2.000000e+00> : tensor<167x64xf32> + %1 = stablehlo.dot_general %arg0, %0, contracting_dims = [2] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x1x167xf32>, tensor<167x64xf32>) -> 
tensor<1x1x64xf32> + return %1 : tensor<1x1x64xf32> +} +// Tests that `composite_dot_general_fn_1` and its corresponding XlaCallModuleOp +// is missing attributes required for quantization. + +// CHECK: %[[CONST:.*]] = stablehlo.constant dense<2.000000e+00> +// CHECK: %[[XLA_CALL_MODULE:.*]] = "tf.XlaCallModule"(%arg0, %[[CONST]]) +// CHECK-SAME: {_entry_function = @composite_dot_general_fn_1, {{.*}}} +// CHECK-NOT: _original_entry_function +// CHECK-NOT: _tfl_quant_trait +// CHECK: return %[[XLA_CALL_MODULE:.*]] : tensor<1x1x64xf32> +// CHECK: } + +// CHECK-LABEL: private @composite_dot_general_fn_1 +// CHECK-NOT: tf_quant.composite_function +// CHECK: %[[DOT_GENERAL:.*]] = stablehlo.dot_general %arg0, %arg1 +// CHECK: return %[[DOT_GENERAL:.*]] : tensor<1x1x64xf32> +// CHECK: } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/optimize_graph.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/optimize_graph.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/optimize_graph.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/optimize_graph.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/post_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/post_quantize.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/post_quantize.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize.mlir similarity index 91% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize.mlir index 67a059c6061ceb..7634d81e976ca1 100644 --- 
a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize.mlir @@ -1,4 +1,4 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize=enable-per-channel-quantization=false -verify-diagnostics | FileCheck %s +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize=enable-per-channel-quantized-weight=false -verify-diagnostics | FileCheck %s // ----- @@ -74,17 +74,17 @@ func.func @dot_redundant_stats(%arg0: tensor) -> tensor { // ----- -// CHECK-LABEL: func @convert_same_scale_propagate -func.func @convert_same_scale_propagate(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { +// CHECK-LABEL: func @reshape_same_scale_propagate +func.func @reshape_same_scale_propagate(%arg0: tensor<2x3xf32>) -> tensor<6xf32> { // CHECK: %[[dq:.*]] = "quantfork.dcast" // CHECK-SAME: (tensor<2x3x!quant.uniform>) %0 = "quantfork.stats"(%arg0) {bitsNum = 8 : i64, layerStats = dense<[-0.999415695, 0.99998933]> : tensor<2xf32>, narrowRange = false} : (tensor<2x3xf32>) -> tensor<2x3xf32> - // CHECK: %[[convert:.*]] = stablehlo.convert %[[dq]] - %1 = stablehlo.convert %0 : (tensor<2x3xf32>) -> (tensor<2x3xf32>) - // CHECK: %[[q:.*]] = "quantfork.qcast"(%[[convert]]) - // CHECK-SAME: -> tensor<2x3x!quant.uniform> - %2 = "quantfork.stats"(%1) {bitsNum = 8 : i64, layerStats = dense<[-2.0, 2.0]> : tensor<2xf32>, narrowRange = false} : (tensor<2x3xf32>) -> tensor<2x3xf32> - func.return %2 : tensor<2x3xf32> + // CHECK: %[[reshape:.*]] = stablehlo.reshape %[[dq]] + %1 = stablehlo.reshape %0 : (tensor<2x3xf32>) -> (tensor<6xf32>) + // CHECK: %[[q:.*]] = "quantfork.qcast"(%[[reshape]]) + // CHECK-SAME: -> tensor<6x!quant.uniform> + %2 = "quantfork.stats"(%1) {bitsNum = 8 : i64, layerStats = dense<[-2.0, 2.0]> : tensor<2xf32>, narrowRange = false} : (tensor<6xf32>) -> tensor<6xf32> + func.return %2 : tensor<6xf32> } // ----- @@ -135,6 +135,6 @@ 
func.func @skip_nan_inf_constant(%arg0: tensor) -> tensor, %arg2: tensor): %7 = stablehlo.maximum %arg1, %arg2 : tensor stablehlo.return %7 : tensor - }) {padding = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>, window_strides = dense<[1, 2, 2, 1]> : tensor<4xi64>} : (tensor, tensor) -> tensor + }) {padding = dense<[[0, 0], [0, 1], [0, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array, window_strides = array} : (tensor, tensor) -> tensor return %6 : tensor } diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_int4.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_int4.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_int4.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_per_channel.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir similarity index 98% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_per_channel.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir index f509ffce05863d..a6159c1dd62b4b 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/prepare_quantize_per_channel.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/prepare_quantize/prepare_quantize_per_channel.mlir @@ -1,4 +1,4 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize=enable-per-channel-quantization=true -verify-diagnostics | FileCheck %s +// RUN: stablehlo-quant-opt %s -split-input-file -stablehlo-prepare-quantize=enable-per-channel-quantized-weight=true -verify-diagnostics | 
FileCheck %s // ----- diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_op_with_region.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir similarity index 95% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_op_with_region.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir index 747e5ee4188757..04104c308a3b3d 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_op_with_region.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_op_with_region.mlir @@ -18,7 +18,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> - // CHECK-SAME: window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64> + // CHECK-SAME: window_dimensions = array // CHECK-SAME: (tensor<2x3x1x3x!quant.uniform>, tensor>) -> tensor<2x3x1x3x!quant.uniform> // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[REDUCE]]) @@ -39,7 +39,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p ^bb0(%arg1: tensor, %arg2: tensor): %14 = stablehlo.maximum %arg1, %arg2 : tensor stablehlo.return %14 : tensor - }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>} : (tensor<2x3x1x3xf32>, tensor) -> 
tensor<2x3x1x3xf32> + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x3xf32>, tensor) -> tensor<2x3x1x3xf32> %12 = "quantfork.qcast"(%11) {volatile} : (tensor<2x3x1x3xf32>) -> tensor<2x3x1x3x!quant.uniform> %13 = "quantfork.dcast"(%12) : (tensor<2x3x1x3x!quant.uniform>) -> tensor<2x3x1x3xf32> return %13 : tensor<2x3x1x3xf32> @@ -74,7 +74,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64> - // CHECK-SAME: window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64> + // CHECK-SAME: window_dimensions = array // CHECK-SAME: (tensor<2x3x1x1024x!quant.uniform>, tensor>) -> tensor<2x3x1x1024x!quant.uniform> // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[CST1]]) @@ -93,7 +93,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p ^bb0(%arg1: tensor, %arg2: tensor): %14 = stablehlo.maximum %arg1, %arg2 : tensor stablehlo.return %14 : tensor - }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = dense<[1, 3, 3, 1]> : tensor<4xi64>} : (tensor<2x3x1x1024xf32>, tensor) -> tensor<2x3x1x1024xf32> + }) {padding = dense<[[0, 0], [1, 1], [1, 1], [0, 0]]> : tensor<4x2xi64>, window_dimensions = array} : (tensor<2x3x1x1024xf32>, tensor) -> tensor<2x3x1x1024xf32> %7 = "quantfork.qcast"(%6) {volatile} : (tensor<2x3x1x1024xf32>) -> tensor<2x3x1x1024x!quant.uniform> %8 = "quantfork.dcast"(%7) : (tensor<2x3x1x1024x!quant.uniform>) -> tensor<2x3x1x1024xf32> %9 = "quantfork.qcast"(%1) {volatile} : (tensor<2x3x1024x3xf32>) -> tensor<2x3x1024x3x!quant.uniform:f32, 4.000000e-01>> @@ -136,7 +136,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], 
%[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64> - // CHECK-SAME: window_dimensions = dense<[1, 3, 1]> : tensor<3xi64> + // CHECK-SAME: window_dimensions = array // CHECK-SAME: (tensor<2x3x3x!quant.uniform>, tensor>) -> tensor<2x3x3x!quant.uniform> // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[REDUCE]]) @@ -160,7 +160,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p ^bb0(%arg1: tensor, %arg2: tensor): %17 = stablehlo.maximum %arg1, %arg2 : tensor stablehlo.return %17 : tensor - }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = dense<[1, 3, 1]> : tensor<3xi64>} : (tensor<2x3x3xf32>, tensor) -> tensor<2x3x3xf32> + }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = array} : (tensor<2x3x3xf32>, tensor) -> tensor<2x3x3xf32> %15 = "quantfork.qcast"(%14) {volatile} : (tensor<2x3x3xf32>) -> tensor<2x3x3x!quant.uniform> %16 = "quantfork.dcast"(%15) : (tensor<2x3x3x!quant.uniform>) -> tensor<2x3x3xf32> return %16 : tensor<2x3x3xf32> @@ -195,7 +195,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: %[[MAX:.*]] = stablehlo.maximum %[[ARG1]], %[[ARG2]] : tensor> // CHECK: stablehlo.return %[[MAX]] : tensor> // CHECK{LITERAL}: padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64> - // CHECK-SAME: window_dimensions = dense<[1, 3, 1]> : tensor<3xi64> + // CHECK-SAME: window_dimensions = array // CHECK-SAME: (tensor<2x3x1024x!quant.uniform>, tensor>) -> tensor<2x3x1024x!quant.uniform> // CHECK: %[[RESHAPE:.*]] = stablehlo.reshape %[[REDUCE]] @@ -215,7 +215,7 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p ^bb0(%arg1: tensor, %arg2: tensor): %17 = stablehlo.maximum %arg1, %arg2 : tensor stablehlo.return %17 : tensor - }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = 
dense<[1, 3, 1]> : tensor<3xi64>} : (tensor<2x3x1024xf32>, tensor) -> tensor<2x3x1024xf32> + }) {padding = dense<[[0, 0], [1, 1], [0, 0]]> : tensor<3x2xi64>, window_dimensions = array} : (tensor<2x3x1024xf32>, tensor) -> tensor<2x3x1024xf32> %7 = "quantfork.qcast"(%6) {volatile} : (tensor<2x3x1024xf32>) -> tensor<2x3x1024x!quant.uniform> %8 = "quantfork.dcast"(%7) : (tensor<2x3x1024x!quant.uniform>) -> tensor<2x3x1024xf32> %9 = stablehlo.reshape %8 : (tensor<2x3x1024xf32>) -> tensor<2x3x1x1024xf32> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir similarity index 90% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir index d149ab12324f52..f437016ed2c6f2 100644 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_same_scale.mlir +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize/quantize_same_scale.mlir @@ -154,46 +154,6 @@ module attributes {tf_saved_model.semantics} { // ----- -module attributes {tf_saved_model.semantics} { - // CHECK-LABEL: composite_and_convert - // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> - // CHECK-SAME: %[[ARG1:.*]]: tensor<2x3xf32> - func.func private @composite_and_convert(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> { - // CHECK: %[[Q1:.*]] = "quantfork.qcast"(%[[ARG0]]) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> - // CHECK: %[[Q2:.*]] = "quantfork.qcast"(%[[ARG1]]) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>> - // CHECK: %[[CALL:.*]] = call @quantized_dot_general_fn_1(%[[Q1]], %[[Q2]]) - // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<1x3x!quant.uniform> - // CHECK: %[[CONVERT:.*]] = 
stablehlo.convert %[[CALL]] : tensor<1x3x!quant.uniform> - // CHECK: %[[DQ:.*]] = "quantfork.dcast"(%[[CONVERT]]) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> - // CHECK: return %[[DQ]] - %0 = "quantfork.qcast"(%arg0) {volatile} : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> - %1 = "quantfork.dcast"(%0) : (tensor<1x2x!quant.uniform>) -> tensor<1x2xf32> - %2 = "quantfork.qcast"(%arg1) {volatile} : (tensor<2x3xf32>) -> tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>> - %3 = "quantfork.dcast"(%2) : (tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<2x3xf32> - %4 = "tf.XlaCallModule"(%1, %3) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - %5 = "quantfork.qcast"(%4) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> - %6 = "quantfork.dcast"(%5) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> - %7 = stablehlo.convert %6 : (tensor<1x3xf32>) -> tensor<1x3xf32> - %8 = "quantfork.qcast"(%7) {volatile} : (tensor<1x3xf32>) -> tensor<1x3x!quant.uniform> - %9 = "quantfork.dcast"(%8) : (tensor<1x3x!quant.uniform>) -> tensor<1x3xf32> - return %9 : tensor<1x3xf32> - } - - // CHECK: quantized_dot_general_fn_1 - // CHECK-SAME: %[[ARG2:.*]]: tensor<1x2x!quant.uniform> - // CHECK-SAME: %[[ARG3:.*]]: tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>> - func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - // CHECK: %[[DOT:.*]] = stablehlo.dot_general %[[ARG2]], %[[ARG3]] - // CHECK-SAME: (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, 6.000000e-03:13>>) -> tensor<1x3x!quant.uniform> - // CHECK: %[[Q3:.*]] = 
stablehlo.uniform_quantize %0 : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> - // CHECK: return %[[Q3]] - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> - } -} - -// ----- - module attributes {tf_saved_model.semantics} { // CHECK-LABEL: composite_and_pad // CHECK-SAME: %[[ARG0:.*]]: tensor<1x2xf32> @@ -351,7 +311,7 @@ module attributes {tf_saved_model.semantics} { collapsed_slice_dims = [0], start_index_map = [1, 0], index_vector_dim = 2>, - slice_sizes = dense<[1, 2, 2]> : tensor<3xi64>, + slice_sizes = array, indices_are_sorted = false } : (tensor<3x4x2xf32>, tensor<2x3x2xi64>) -> tensor<2x3x2x2xf32> %8 = "quantfork.qcast"(%7) {volatile} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2x!quant.uniform> diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir new file mode 100644 index 00000000000000..ff7bfafe654099 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/quantize_composite_functions.mlir @@ -0,0 +1,567 @@ +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -stablehlo-quantize-composite-functions | FileCheck %s +// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ +// RUN: -stablehlo-quantize-composite-functions=enable-per-channel-quantized-weight=false | FileCheck --check-prefix=CHECK-PER-TENSOR %s + +// Tests that basic dot_general is properly quantized. 
+ +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. 
+ +// CHECK-PER-TENSOR: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// Checks that the entry function is quantized for dot_general. Quantized +// dot_general outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. 
+ +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> +} + +// ----- + +// Tests that fused pattern for dot_general + bias is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3xf32>} : () -> tensor<1x3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_same_shape_fn, _original_entry_function = "composite_dot_general_with_bias_same_shape_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> + return %2 : tensor<1x3xf32> + } +// 
CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_same_shape_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_same_shape_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> + + func.func private @composite_dot_general_with_bias_same_shape_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_with_bias_same_shape_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> 
tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[ARG_3]] : tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> + +} + +// ----- + +// Tests that fused pattern for dot_general + bias with dynamic batch dimension +// is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_dot_general_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<3xf32>} : () -> tensor<3xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK-PER-TENSOR: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = 
dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_dot_general_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3xf32>, %arg2: tensor<3xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<2> : tensor<1xi32> + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x3xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [1] : (tensor<3xf32>, tensor<2xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + return %5 : tensor + } +} +// CHECK-PER-TENSOR: func.func private @quantized_dot_general_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<3x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[CONST_2:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[DOT_GENERAL_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = 
stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [1] : (tensor<3x!quant.uniform>, tensor<2xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor> + + +// ----- + +// Tests that basic convolution is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64, _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// Check that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. 
+ +// CHECK: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, 
i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + return %0 : tensor<1x3x4x2xf32> + } +// Checks that the entry function is quantized for convolution. Quantized +// convolution outputs an i32 quantized tensor, followed by requantization to +// i8 quantized tensor. + +// CHECK: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Tests that fused pattern for convolution + bias is properly quantized. + +// Checks that fused functions with 1D bias is properly quantized. 
+// The 1D bias should be broadcasted in dims [3], where it initially has +// `quantizedDimension=0`, but has `quantizedDimension=3` after broadcasting. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_1d_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<2xf32>} : () -> tensor<2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_1d_fn, _original_entry_function = "composite_conv_with_bias_1d_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<47978> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call 
@quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_1d_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2xi32>} : () -> tensor<2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_1d_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_with_bias_1d_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.broadcast_in_dim %arg2, dims = [3] : (tensor<2xf32>) -> tensor<1x3x4x2xf32> + %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], 
[1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = stablehlo.add %1, %0 : tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantized_conv_with_bias_1d_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2, dims = [3] : (tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_1d_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %[[ARG_3]] +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] 
: (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Checks that fused functions with 4D bias is properly quantized. +// The 4D bias should be braoadcasted in dims [0, 1, 2, 3], where it +// already has `quantizedDimension=3`. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_fn, _original_entry_function = "composite_conv_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> 
tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> + + func.func private @composite_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.broadcast_in_dim %arg2, 
dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>) -> tensor<1x3x4x2xf32> + %1 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> + %2 = stablehlo.add %1, %0 : tensor<1x3x4x2xf32> + return %2 : tensor<1x3x4x2xf32> + } +// CHECK: func.func private @quantized_conv_with_bias_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2, dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR: %[[BROADCAST_IN_DIM:.+]] = stablehlo.broadcast_in_dim %arg2 +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> +// 
CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[BROADCAST_IN_DIM]] : tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> +} + +// ----- + +// Tests that fused pattern for convolution + bias with dynamic batch dimension +// is properly quantized. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_dynamic_fn, _original_entry_function = "composite_conv_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : 
tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x1x1x2x!quant.uniform) -> tensor +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 
1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + return %5 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : 
(tensor>) +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// ----- + +// Tests that fused pattern for convolution + bias + relu with +// dynamic batch dimension is properly quantized. + +// Note that this checks for identical condition as +// quantize_conv_with_bias_dynamic_fn, omitting stablehlo.maximum. 
+// This is because activation clipping which includes 0.0f can be simply +// omitted from the graph as the lifted function's out_scale and out_zp are +// already calculated based on the clipped distribution. +// Note that the resulting scale and zero point should be calculated based on +// clipped range [0, r_max]. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_and_relu_dynamic_fn, _original_entry_function = "composite_conv_with_bias_and_relu_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantfork.stats"(%1) {layerStats = dense<[0.00000000e-6, 8.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : 
(tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_and_relu_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> + %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor + %cst_4 = stablehlo.constant dense<6.000000e+00> : tensor + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers 
= [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + %6 = stablehlo.clamp %cst_3, %5, %cst_4 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_and_relu_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) 
-> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_and_relu_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// ----- + +// Tests that fused pattern for convolution + bias + relu6 
with +// dynamic batch dimension is properly quantized. + +// Note that this checks for identical condition as +// quantize_conv_with_bias_dynamic_fn, omitting stablehlo.clamp. +// This is because activation clipping which includes 0.0f can be simply +// omitted from the graph as the lifted function's out_scale and out_zp are +// already calculated based on the clipped distribution. +// Note that the resulting scale and zero point should be calculated based on +// clipped range [0, r_max]. + +module attributes {tf_saved_model.semantics} { + func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_and_relu6_dynamic_fn, _original_entry_function = "composite_conv_with_bias_and_relu6_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3x3x2xf32>, tensor<1x1x1x2xf32>) -> tensor + %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 6.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor + return %2 : tensor + } +// CHECK: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}> +// CHECK-DAG: %[[CONST_1:.+]] = 
stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + +// CHECK-PER-TENSOR: func.func private @quantize_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} +// CHECK-PER-TENSOR-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> +// CHECK-PER-TENSOR-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x1x1x2xi32>} : () -> tensor<1x1x1x2x!quant.uniform +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> +// CHECK-PER-TENSOR: %[[CALL_0:.+]] = call @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>, tensor<1x1x1x2x!quant.uniform>) -> tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor +// CHECK-PER-TENSOR: return %[[UNIFORM_DEQUANTIZE_0]] : tensor + + func.func private @composite_conv_with_bias_and_relu6_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x1x1x2xf32>) -> tensor attributes {_from_xla_call_module} { + %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> + %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> + %cst_2 = stablehlo.constant dense<2> 
: tensor<1xi32> + %cst_3 = stablehlo.constant dense<0.000000e+00> : tensor + %cst_4 = stablehlo.constant dense<6.000000e+00> : tensor + %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor + %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor + %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> + %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [0, 1, 2, 3] : (tensor<1x1x1x2xf32>, tensor<4xi32>) -> tensor + %5 = stablehlo.add %0, %4 : tensor + %6 = stablehlo.clamp %cst_3, %5, %cst_4 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } +} +// CHECK: func.func private @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32:3, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32:3, {0.0023622048182750312,0.0023622048182750312}>>) -> tensor> +// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> 
tensor<4xi32> +// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// CHECK-PER-TENSOR: func.func private @quantized_conv_with_bias_and_relu6_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x1x1x2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} +// CHECK-PER-TENSOR-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> +// CHECK-PER-TENSOR-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, 0.0023622048182750312>>) -> tensor> +// CHECK-PER-TENSOR: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) +// CHECK-PER-TENSOR: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> +// CHECK-PER-TENSOR: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> +// CHECK-PER-TENSOR: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [0, 1, 2, 3] : (tensor<1x1x1x2x!quant.uniform>, tensor<4xi32>) -> tensor> +// CHECK-PER-TENSOR: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> +// CHECK-PER-TENSOR: %[[UNIFORM_QUANTIZE_0:.+]] = 
stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -> tensor> +// CHECK-PER-TENSOR: return %[[UNIFORM_QUANTIZE_0]] : tensor> + +// ----- + +// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. + +module attributes {tf_saved_model.semantics} { + func.func private @not_quantized_without_stats_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> + %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %1 : tensor<1x3xf32> + } +// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is +// not quantized. 
+ +// CHECK: func.func private @not_quantized_without_stats_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} +// CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> +// CHECK: %[[XLA_CALL_MODULE_0:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// CHECK: return %[[XLA_CALL_MODULE_0]] + + func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } +// CHECK: func.func private @composite_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2xf32>, %[[ARG_2:.+]]: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} +// Check that the composite_dot_general_fn is untouched. +// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]] +// CHECK: return %[[DOT_GENERAL_0]] +} + +// ----- + +// Tests that basic gather is properly quantized. 
+ +module attributes {tf_saved_model.semantics} { +// CHECK: func.func private @quantize_gather_fn(%[[ARG:.+]]: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} + func.func private @quantize_gather_fn(%arg: tensor<3x4x2xf32>) -> tensor<2x3x2x2xf32> attributes {tf._original_func_name = "main_0"} { + %cst = "tf.Const"() {value = dense<1> : tensor<2x3x2xi32>} : () -> tensor<2x3x2xi32> + %0 = "quantfork.stats"(%arg) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<3x4x2xf32>) -> tensor<3x4x2xf32> + %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<2x3x2x2>], _entry_function = @composite_gather_fn, _original_entry_function = "composite_gather_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + %2 = "quantfork.stats"(%1) {layerStats = dense<[4.00000000e-6, 9.80000000e-1]> : tensor<2xf32>} : (tensor<2x3x2x2xf32>) -> tensor<2x3x2x2xf32> + return %2 : tensor<2x3x2x2xf32> + } +// Checks that the quantized XlaCallModule has been replaced by a CallOp, which +// calls the quantized entry function. 
+// CHECK: %[[CONST:.+]] = stablehlo.constant dense<{{.*}}> : tensor<2x3x2xi32> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<3x4x2xf32>) -> tensor<3x4x2x!quant.uniform> +// CHECK: %[[CALL:.+]] = call @quantized_gather_fn(%[[UNIFORM_QUANTIZE]], %[[CONST]]) : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[UNIFORM_DEQUANTIZE:.+]] = stablehlo.uniform_dequantize %[[CALL]] : (tensor<2x3x2x2x!quant.uniform) -> tensor<2x3x2x2xf32> +// CHECK: return %[[UNIFORM_DEQUANTIZE]] : tensor<2x3x2x2xf32> + +// CHECK: func.func private @quantized_gather_fn(%[[ARG_0:.+]]: tensor<3x4x2x!quant.uniform>, %[[ARG_1:.+]]: tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> attributes {_from_xla_call_module} + func.func private @composite_gather_fn(%arg0: tensor<3x4x2xf32>, %arg1: tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> attributes {_from_xla_call_module} { + %0 = "stablehlo.gather"(%arg0, %arg1) { + dimension_numbers = #stablehlo.gather< + offset_dims = [2, 3], + collapsed_slice_dims = [0], + start_index_map = [1, 0], + index_vector_dim = 2>, + slice_sizes = array, + indices_are_sorted = false + } : (tensor<3x4x2xf32>, tensor<2x3x2xi32>) -> tensor<2x3x2x2xf32> + return %0 : tensor<2x3x2x2xf32> + } +// CHECK: %[[GATHER:.+]] = "stablehlo.gather"(%[[ARG_0]], %[[ARG_1]]) {{.*}} : (tensor<3x4x2x!quant.uniform>, tensor<2x3x2xi32>) -> tensor<2x3x2x2x!quant.uniform> +// CHECK: %[[UNIFORM_QUANTIZE:.+]] = stablehlo.uniform_quantize %[[GATHER]] : tensor<2x3x2x2x!quant.uniform> +// CHECK: return %[[UNIFORM_QUANTIZE]] : tensor<2x3x2x2x!quant.uniform> +} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir similarity index 100% rename from 
tensorflow/compiler/mlir/quantization/stablehlo/tests/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/replace_stablehlo_ops_in_main_function_with_xla_call_module_ops.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/restore_function_name.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/restore_function_name.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/restore_function_name.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/restore_function_name.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/unfuse_mhlo_batch_norm.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/unfuse_mhlo_batch_norm.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unfuse_mhlo_batch_norm.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/unwrap_xla_call_module_op.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unwrap_xla_call_module_op.mlir similarity index 100% rename from tensorflow/compiler/mlir/quantization/stablehlo/tests/unwrap_xla_call_module_op.mlir rename to tensorflow/compiler/mlir/quantization/stablehlo/tests/passes/unwrap_xla_call_module_op.mlir diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir deleted file mode 100644 index 779ef786714fb7..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/populate_shape.mlir +++ /dev/null @@ -1,44 +0,0 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -populate-shape | FileCheck %s - -// CHECK-LABEL: @populate_shape_for_custom_aggregator -func.func 
@populate_shape_for_custom_aggregator(%input: tensor) { - // CHECK: %[[OUTPUT:.*]] = "tf.CustomAggregator"(%[[INPUT:.*]]) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor - %0 = "tf.CustomAggregator"(%input) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor<*xf32> - func.return -} - -// ---- - -// CHECK-LABEL: @populate_shape_for_xla_call_module -func.func @populate_shape_for_xla_call_module(%input: tensor) { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> - // CHECK: %[[OUTPUT:.*]] = "tf.XlaCallModule"(%[[INPUT:.*]], %[[CST:.*]]) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor - %0 = "tf.XlaCallModule"(%input, %cst) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor<*xf32> - func.return -} - -// ---- - -// CHECK-LABEL: @populate_shape_for_chain_of_ops -func.func @populate_shape_for_chain_of_ops(%input: tensor) { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> - // CHECK: 
%[[VAL_0:.*]] = "tf.CustomAggregator"(%[[INPUT:.*]]) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor - // CHECK: %[[VAL_1:.*]] = "tf.XlaCallModule"(%[[VAL_0:.*]], %[[CST:.*]]) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> tensor - // CHECK: %[[VAL_2:.*]] = "tf.CustomAggregator"(%[[VAL_1:.*]]) <{id = "49d53b1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor - %0 = "tf.CustomAggregator"(%input) <{id = "49d53b0"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor) -> tensor<*xf32> - %1 = "tf.XlaCallModule"(%0, %cst) <{Sout = [#tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<*xf32>, tensor<1x1x64x256xf32>) -> tensor<*xf32> - %2 = "tf.CustomAggregator"(%1) <{id = "49d53b1"}> {calibration_method = 1 : i64, device = "", initial_num_bins = 0 : i64, max = 6.000000e+00 : f32, max_percentile = 0.000000e+00 : f32, min = 0.000000e+00 : f32, min_percentile = 0.000000e+00 : f32} : (tensor<*xf32>) -> tensor<*xf32> - 
func.return -} - -// ---- - -// CHECK-LABEL: @populate_shape_for_xla_call_module_failure_not_single_output -func.func @populate_shape_for_xla_call_module_failure_not_single_output(%input: tensor) { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<1x1x64x256xf32>} : () -> tensor<1x1x64x256xf32> - // expected-error @+2 {{XlaCallModuleOp doesn't have 1 output.}} - %0, %1 = "tf.XlaCallModule"(%input, %cst) <{Sout = [#tf_type.shape, #tf_type.shape], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> (tensor<*xf32>, tensor<*xf32>) - // expected-error @+1 {{XlaCallModuleOp doesn't have 1 output.}} - "tf.XlaCallModule"(%input, %cst) <{Sout = [], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64}> {_entry_function = @main_9, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor, tensor<1x1x64x256xf32>) -> () - func.return -} diff --git a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir b/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir deleted file mode 100644 index d02b3ada1aa60e..00000000000000 --- a/tensorflow/compiler/mlir/quantization/stablehlo/tests/quantize_composite_functions.mlir +++ /dev/null @@ -1,277 +0,0 @@ -// RUN: stablehlo-quant-opt %s -split-input-file -verify-diagnostics \ -// RUN: -stablehlo-quantize-composite-functions | FileCheck %s - - -// Tests that basic dot_general is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. -// TODO - b/305469508: Fix the QuantizePass to avoid this warning. 
-// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_dot_general_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} - func.func private @quantize_dot_general_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %2 : tensor<1x3xf32> - } -// Checks that the quantized XlaCallModule has been replaced by a CallOp, which -// calls the quantized entry function. 
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> - -// CHECK: func.func private @quantized_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> - } -// Checks that the entry function is quantized for dot_general. Quantized -// dot_general outputs an i32 quantized tensor, followed by requantization to -// i8 quantized tensor. -// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x!quant.uniform> -// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[DOT_GENERAL_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> -// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> -} - -// ----- - -// Tests that fused pattern for dot_general + bias is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. 
-// TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_dot_general_with_bias_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} - func.func private @quantize_dot_general_with_bias_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3xf32>} : () -> tensor<1x3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x2xf32>) -> tensor<1x2xf32> - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_with_bias_fn, _original_entry_function = "composite_dot_general_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>, tensor<1x3xf32>) -> tensor<1x3xf32> - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3xf32>) -> tensor<1x3xf32> - return %2 : tensor<1x3xf32> - } -// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> -// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3xi32>} : () -> tensor<1x3x!quant.uniform -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x2xf32>) -> tensor<1x2x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : 
(tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<1x3x!quant.uniform) -> tensor<1x3x!quant.uniform -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x!quant.uniform) -> tensor<1x3xf32> -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3xf32> - -// CHECK: func.func private @quantized_dot_general_with_bias_fn(%[[ARG_1:.+]]: tensor<1x2x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_dot_general_with_bias_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>, %arg2: tensor<1x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - %1 = stablehlo.add %0, %arg2 : tensor<1x3xf32> - return %1 : tensor<1x3xf32> - } -// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor<1x2x!quant.uniform>, tensor<2x3x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x!quant.uniform> -// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[ARG_3]] : tensor<1x3x!quant.uniform> -// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x!quant.uniform>) -> tensor<1x3x!quant.uniform> -// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x!quant.uniform> - -} - -// ----- - -// Tests that fused pattern for dot_general + bias with dynamic batch dimension -// is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. -// TODO - b/305469508: Fix the QuantizePass to avoid this warning. 
-// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_dot_general_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} - func.func private @quantize_dot_general_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<3xf32>} : () -> tensor<3xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape], _entry_function = @composite_dot_general_with_bias_dynamic_fn, _original_entry_function = "composite_dot_general_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3xf32>, tensor<3xf32>) -> tensor - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor - return %2 : tensor - } -// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3xi8>} : () -> tensor<2x3x!quant.uniform:f32, {{.*}}> -// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<3xi32>} : () -> tensor<3x!quant.uniform -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_dot_general_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>, tensor<3x!quant.uniform) -> tensor -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : 
(tensor) -> tensor -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor - -// CHECK: func.func private @quantized_dot_general_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<3x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} - func.func private @composite_dot_general_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3xf32>, %arg2: tensor<3xf32>) -> tensor attributes {_from_xla_call_module} { - %cst_0 = stablehlo.constant dense<2> : tensor<1xi32> - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor, tensor<2x3xf32>) -> tensor - %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor - %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> - %3 = stablehlo.concatenate %2, %cst_0, dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> - %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [1] : (tensor<3xf32>, tensor<2xi32>) -> tensor - %5 = stablehlo.add %0, %4 : tensor - return %5 : tensor - } -} -// CHECK: %[[CONST_2:.+]] = stablehlo.constant dense<2> : tensor<1xi32> -// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]], contracting_dims = [1] x [0] : (tensor>, tensor<2x3x!quant.uniform:f32, {{.*}}>>) -> tensor> -// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[DOT_GENERAL_0]], dim = 0 : (tensor) -// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> -// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], dim = 0 : (tensor<1xi32>, tensor<1xi32>) -> tensor<2xi32> -// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [1] : (tensor<3x!quant.uniform>, tensor<2xi32>) -> tensor> -// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[DOT_GENERAL_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> -// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = 
stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor> - - -// ----- - -// Tests that basic convolution is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. -// TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_conv_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} - func.func private @quantize_conv_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> - %1 = "tf.XlaCallModule"(%0, %cst) {Sout = [#tf_type.shape<1x3x4x2>], dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64, _entry_function = @composite_conv_fn, _original_entry_function = "composite_conv_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = ""} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> - return %2 : tensor<1x3x4x2xf32> - } -// Check that the quantized XlaCallModule has been replaced by a CallOp, which -// calls the quantized entry function. 
-// CHECK: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> - -// CHECK: func.func private @quantized_conv_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_conv_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> - return %0 : tensor<1x3x4x2xf32> - } -// Checks that the entry function is quantized for convolution. Quantized -// convolution outputs an i32 quantized tensor, followed by requantization to -// i8 quantized tensor. 
-// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>) -> tensor<1x3x4x2x!quant.uniform> -// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[CONVOLUTION_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> -// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> -} - -// ----- - -// Tests that fused pattern for convolution + bias is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. -// TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_conv_with_bias_fn(%[[ARG_0:.+]]: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} - func.func private @quantize_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>) -> tensor<1x3x4x2xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> - %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<1x3x4x2xf32>} : () -> tensor<1x3x4x2xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3xf32> - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_fn, _original_entry_function = "composite_conv_with_bias_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>, tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> - %2 = "quantfork.stats"(%1) {layerStats = 
dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> - return %2 : tensor<1x3x4x2xf32> - } -// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> -// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<1x3x4x2xi32>} : () -> tensor<1x3x4x2x!quant.uniform -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor<1x3x4x3xf32>) -> tensor<1x3x4x3x!quant.uniform> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor<1x3x4x3x!quant.uniform>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2x!quant.uniform -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor<1x3x4x2x!quant.uniform) -> tensor<1x3x4x2xf32> -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor<1x3x4x2xf32> - -// CHECK: func.func private @quantized_conv_with_bias_fn(%[[ARG_1:.+]]: tensor<1x3x4x3x!quant.uniform>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> attributes {_from_xla_call_module} - func.func private @composite_conv_with_bias_fn(%arg0: tensor<1x3x4x3xf32>, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<1x3x4x2xf32>) -> tensor<1x3x4x2xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x4x2xf32> - %1 = stablehlo.add %0, %arg2 : tensor<1x3x4x2xf32> - return %1 : tensor<1x3x4x2xf32> - } -// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor<1x3x4x3x!quant.uniform>, 
tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor<1x3x4x2x!quant.uniform> -// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[ARG_3]] : tensor<1x3x4x2x!quant.uniform> -// CHECK: %[[UNIFORM_QUANTIZE_1:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor<1x3x4x2x!quant.uniform>) -> tensor<1x3x4x2x!quant.uniform> -// CHECK: return %[[UNIFORM_QUANTIZE_1]] : tensor<1x3x4x2x!quant.uniform> -} - -// ----- - -// Tests that fused pattern for convolution + bias with dynamic batch dimension -// is properly quantized. - -// The following pattern does not converge because of a bug in QuantizePass. -// TODO - b/305469508: Fix the QuantizePass to avoid this warning. -// expected-warning @+1 {{Failed to converge pattern at QuantizePass.}} -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @quantize_conv_with_bias_dynamic_fn(%[[ARG_0:.+]]: tensor) -> tensor attributes {tf._original_func_name = "main_0"} - func.func private @quantize_conv_with_bias_dynamic_fn(%arg0: tensor) -> tensor attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> - %cst_0 = "tf.Const"() {value = dense<4.00000000e-1> : tensor<2xf32>} : () -> tensor<2xf32> - %0 = "quantfork.stats"(%arg0) {layerStats = dense<[6.00000000e-6, 9.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor - %1 = "tf.XlaCallModule"(%0, %cst, %cst_0) {Sout = [#tf_type.shape<1x3x4x2>], _entry_function = @composite_conv_with_bias_dynamic_fn, _original_entry_function = "composite_conv_with_bias_dynamic_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<2x3x3x2xf32>, tensor<2xf32>) -> tensor - %2 = "quantfork.stats"(%1) {layerStats = dense<[5.00000000e-6, 7.00000000e-1]> : tensor<2xf32>} : (tensor) -> tensor - return %2 : tensor - } 
- -// CHECK-DAG: %[[CONST_0:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2x3x3x2xi8>} : () -> tensor<2x3x3x2x!quant.uniform:f32, {{.*}}> -// CHECK-DAG: %[[CONST_1:.+]] = stablehlo.constant() {value = dense<{{.*}}> : tensor<2xi32>} : () -> tensor<2x!quant.uniform -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ARG_0]] : (tensor) -> tensor> -// CHECK: %[[CALL_0:.+]] = call @quantized_conv_with_bias_dynamic_fn(%[[UNIFORM_QUANTIZE_0]], %[[CONST_0]], %[[CONST_1]]) : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>, tensor<2x!quant.uniform) -> tensor -// CHECK: %[[UNIFORM_DEQUANTIZE_0:.+]] = stablehlo.uniform_dequantize %[[CALL_0]] : (tensor) -> tensor -// CHECK: return %[[UNIFORM_DEQUANTIZE_0]] : tensor - -// CHECK: func.func private @quantized_conv_with_bias_dynamic_fn(%[[ARG_1:.+]]: tensor>, %[[ARG_2:.+]]: tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>>, %[[ARG_3:.+]]: tensor<2x!quant.uniform>) -> tensor> attributes {_from_xla_call_module} -func.func private @composite_conv_with_bias_dynamic_fn(%arg0: tensor, %arg1: tensor<2x3x3x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module} { - %cst_0 = stablehlo.constant dense<3> : tensor<1xi32> - %cst_1 = stablehlo.constant dense<4> : tensor<1xi32> - %cst_2 = stablehlo.constant dense<2> : tensor<1xi32> - %0 = stablehlo.convolution(%arg0, %arg1) dim_numbers = [b, 0, 1, f]x[0, 1, i, o]->[b, 0, 1, f], window = {pad = [[0, 1], [1, 1]]} {batch_group_count = 1 : i64, feature_group_count = 1 : i64} : (tensor, tensor<2x3x3x2xf32>) -> tensor - %1 = stablehlo.get_dimension_size %0, dim = 0 : (tensor) -> tensor - %2 = stablehlo.reshape %1 : (tensor) -> tensor<1xi32> - %3 = stablehlo.concatenate %2, %cst_0, %cst_1, %cst_2, dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> - %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [3] : (tensor<2xf32>, tensor<4xi32>) -> tensor - %5 = stablehlo.add %0, %4 : tensor - return %5 : tensor - } -} 
-// CHECK-DAG: %[[CONST_2:.+]] = stablehlo.constant dense<3> : tensor<1xi32> -// CHECK-DAG: %[[CONST_3:.+]] = stablehlo.constant dense<4> : tensor<1xi32> -// CHECK-DAG: %[[CONST_4:.+]] = stablehlo.constant dense<2> : tensor<1xi32> -// CHECK: %[[CONVOLUTION_0:.+]] = stablehlo.convolution(%[[ARG_1]], %[[ARG_2]]) {{.*}} : (tensor>, tensor<2x3x3x2x!quant.uniform:f32, {{.*}}>) -> tensor> -// CHECK: %[[GET_DIMENSION_SIZE_0:.+]] = stablehlo.get_dimension_size %[[CONVOLUTION_0]], dim = 0 : (tensor) -// CHECK: %[[RESHAPE_0:.+]] = stablehlo.reshape %[[GET_DIMENSION_SIZE_0]] : (tensor) -> tensor<1xi32> -// CHECK: %[[CONCATENATE_0:.+]] = stablehlo.concatenate %[[RESHAPE_0]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]], dim = 0 : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<4xi32> -// CHECK: %[[DYNAMIC_BROADCAST_IN_DIM_0:.+]] = stablehlo.dynamic_broadcast_in_dim %[[ARG_3]], %[[CONCATENATE_0]], dims = [3] : (tensor<2x!quant.uniform>, tensor<4xi32>) -> tensor> -// CHECK: %[[ADD_0:.+]] = stablehlo.add %[[CONVOLUTION_0]], %[[DYNAMIC_BROADCAST_IN_DIM_0]] : tensor> -// CHECK: %[[UNIFORM_QUANTIZE_0:.+]] = stablehlo.uniform_quantize %[[ADD_0]] : (tensor>) -// CHECK: return %[[UNIFORM_QUANTIZE_0]] : tensor> - -// ----- - -// Tests that XlaCallModule op is not quantized without the quantfork.stats ops. 
- -module attributes {tf_saved_model.semantics} { -// CHECK: func.func private @not_quantized_without_stats_fn(%[[ARG_0:.+]]: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} - func.func private @not_quantized_without_stats_fn(%arg0: tensor<1x2xf32>) -> tensor<1x3xf32> attributes {tf._original_func_name = "main_0"} { - %cst = "tf.Const"() {value = dense<3.00000000e-1> : tensor<2x3xf32>} : () -> tensor<2x3xf32> - %1 = "tf.XlaCallModule"(%arg0, %cst) {Sout = [#tf_type.shape<1x3>], _entry_function = @composite_dot_general_fn, _original_entry_function = "composite_dot_general_fn", _stablehlo_module_attrs = {}, _tfl_quant_trait = "fully_quantizable", device = "", dim_args_spec = [], disabled_checks = [], has_token_input_output = false, module = "", platforms = [], version = 5 : i64} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - return %1 : tensor<1x3xf32> - } - -// Check that "tf.Const" is converted to stablehlo.constant. XlaCallModule is -// not quantized. -// CHECK: %[[CONST_0:.+]] = stablehlo.constant dense<3.000000e-01> : tensor<2x3xf32> -// CHECK: %[[XLA_CALL_MODULE_0:.+]] = "tf.XlaCallModule"(%[[ARG_0]], %[[CONST_0]]) <{{{.*}}}> {{{.*_entry_function = @composite_dot_general_fn.*}}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> -// CHECK: return %[[XLA_CALL_MODULE_0]] - -// CHECK: func.func private @composite_dot_general_fn(%[[ARG_1:.+]]: tensor<1x2xf32>, %[[ARG_2:.+]]: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} - func.func private @composite_dot_general_fn(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes {_from_xla_call_module} { - %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> - return %0 : tensor<1x3xf32> - } - -// Check that the composite_dot_general_fn is untouched. 
-// CHECK: %[[DOT_GENERAL_0:.+]] = stablehlo.dot_general %[[ARG_1]], %[[ARG_2]] -// CHECK: return %[[DOT_GENERAL_0]] -} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD index 56161914b45dd4..1d2608599bd93b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/BUILD @@ -406,6 +406,7 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/lite/quantization/ir:QuantOps", "//tensorflow/compiler/mlir/quantization/common:attrs_and_constraints", + "//tensorflow/compiler/mlir/quantization/common:func", "//tensorflow/compiler/mlir/quantization/common:lift_as_function_call", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:const_op_size", "//tensorflow/compiler/mlir/quantization/tensorflow/cc:constant_fold", @@ -479,9 +480,11 @@ cc_library( compatible_with = get_compatible_with_portable(), deps = [ ":passes", + "//tensorflow/compiler/mlir/lite/stablehlo:fuse_convolution_pass", "//tensorflow/compiler/mlir/lite/stablehlo:legalize_tf_xla_call_module_to_stablehlo_pass", "//tensorflow/compiler/mlir/lite/stablehlo:rename_entrypoint_to_main", "//tensorflow/compiler/mlir/lite/stablehlo:tf_stablehlo", + "//tensorflow/compiler/mlir/lite/stablehlo:unfuse_batch_norm_pass", "//tensorflow/compiler/mlir/quantization/stablehlo:bridge_passes", "//tensorflow/compiler/mlir/quantization/stablehlo:passes", "//tensorflow/compiler/mlir/quantization/stablehlo/cc:pass_pipeline", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD index 01d7a7b37e1907..62a6f27c8ad5f1 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/BUILD @@ -99,6 +99,7 @@ cc_library( hdrs = ["convert_asset_args.h"], compatible_with = get_compatible_with_portable(), deps = [ + 
"//tensorflow/compiler/mlir/quantization/common:func", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/core/protobuf:for_core_protos_cc", @@ -139,9 +140,9 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:error_util", "@com_google_absl//absl/status", "@com_google_absl//absl/strings:str_format", - "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:statusor", ], ) @@ -172,13 +173,16 @@ tf_cc_test( srcs = ["constant_fold_test.cc"], deps = [ ":constant_fold", + "//tensorflow/compiler/mlir/quantization/common:test_base", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/core:tensorflow", "//tensorflow/core:test", "//tensorflow/core:test_main", + "@com_google_absl//absl/strings:string_view", + "@com_google_googletest//:gtest", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", - "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", ], ) diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc index f2a7942323f5f5..5122563c235193 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold_test.cc @@ -14,12 +14,19 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/constant_fold.h" +#include + +#include +#include "absl/strings/string_view.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/OwningOpRef.h" // from @llvm-project -#include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/mlir/quantization/common/test_base.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/core/platform/test.h" @@ -30,38 +37,7 @@ namespace { using ::testing::NotNull; using ::testing::SizeIs; -class ConstantFoldingTest : public ::testing::Test { - protected: - ConstantFoldingTest() { - ctx_.loadDialect(); - } - - // Parses `module_op_str` to create a `ModuleOp`. Checks whether the created - // module op is valid. - OwningOpRef ParseModuleOpString(absl::string_view module_op_str) { - auto module_op_ref = parseSourceString(module_op_str, &ctx_); - EXPECT_TRUE(module_op_ref); - return module_op_ref; - } - - // Gets the function with the given name from the module. - func::FuncOp GetFunctionFromModule(ModuleOp module, - absl::string_view function_name) { - SymbolTable symbol_table(module); - return symbol_table.lookup(function_name); - } - - // Returns the first operation with the given type in the function. 
- template - OpType FindOperationOfType(func::FuncOp function) { - for (auto op : function.getBody().getOps()) { - return op; - } - return nullptr; - } - - MLIRContext ctx_{}; -}; +class ConstantFoldingTest : public QuantizationTestBase {}; TEST_F(ConstantFoldingTest, FoldLargeConstant) { constexpr absl::string_view kModuleCode = R"mlir( @@ -80,8 +56,10 @@ TEST_F(ConstantFoldingTest, FoldLargeConstant) { )mlir"; OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "test_fold_constant"); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + Operation* mul_op = FindOperationOfType(test_func); SmallVector results = ConstantFoldOpIfPossible(mul_op); EXPECT_THAT(results, SizeIs(1)); @@ -106,8 +84,10 @@ TEST_F(ConstantFoldingTest, NotFoldingIdentity) { )mlir"; OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "test_fold_constant"); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + Operation* op_to_fold = FindOperationOfType(test_func); SmallVector results = ConstantFoldOpIfPossible(op_to_fold); EXPECT_THAT(results, SizeIs(1)); @@ -135,8 +115,10 @@ TEST_F(ConstantFoldingTest, NotFoldingArgument) { )mlir"; OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "test_fold_constant"); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); + Operation* op_to_fold = FindOperationOfType(test_func); SmallVector results = ConstantFoldOpIfPossible(op_to_fold); EXPECT_THAT(results, SizeIs(1)); @@ -166,11 +148,12 @@ TEST_F(ConstantFoldingTest, FoldDepthwiseConvWeight) { )mlir"; OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); - func::FuncOp test_func = - 
GetFunctionFromModule(*module_op_ref, "test_fold_constant"); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); - RewritePatternSet patterns(&ctx_); - patterns.add(&ctx_); + RewritePatternSet patterns(ctx_.get()); + patterns.add(ctx_.get()); EXPECT_TRUE( succeeded(applyPatternsAndFoldGreedily(test_func, std::move(patterns)))); @@ -198,11 +181,12 @@ TEST_F(ConstantFoldingTest, DepthwiseConvWeightNotFoldable) { )mlir"; OwningOpRef module_op_ref = ParseModuleOpString(kModuleCode); - func::FuncOp test_func = - GetFunctionFromModule(*module_op_ref, "test_fold_constant"); + const auto test_func = + module_op_ref->lookupSymbol("test_fold_constant"); + ASSERT_THAT(test_func, NotNull()); - RewritePatternSet patterns(&ctx_); - patterns.add(&ctx_); + RewritePatternSet patterns(ctx_.get()); + patterns.add(ctx_.get()); EXPECT_TRUE( succeeded(applyPatternsAndFoldGreedily(test_func, std::move(patterns)))); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc index d88a1fe42cc555..8e2b3537b34d85 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/convert_asset_args.cc @@ -25,6 +25,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/core/protobuf/meta_graph.pb.h" @@ -38,20 +39,6 @@ using ::mlir::tf_saved_model::LookupBoundInputOfType; using ::tensorflow::AssetFileDef; using ::tensorflow::kImportModelDefaultGraphFuncName; -// Gets the "main" function from the module. Returns an empty op iff it doesn't -// exist. -func::FuncOp GetMainFunction(ModuleOp module_op) { - const auto main_func_id = - StringAttr::get(module_op.getContext(), kImportModelDefaultGraphFuncName); - auto func_ops = module_op.getOps(); - auto main_func_itr = absl::c_find_if(func_ops, [&main_func_id](auto func_op) { - return func_op.getName() == main_func_id; - }); - - if (main_func_itr == func_ops.end()) return {}; - return *main_func_itr; -} - // Given argument attributes `arg_attrs`, returns a new set of argument // attributes where the "tf_saved_model.bound_input" attribute has been replaced // with the "tf_saved_model.index_path" attribute. 
`index_path` is the element @@ -130,7 +117,7 @@ void ConvertMainArgAttrs(func::FuncOp main_func_op, const int arg_idx, } // namespace FailureOr> ConvertAssetArgs(ModuleOp module_op) { - func::FuncOp main_func_op = GetMainFunction(module_op); + func::FuncOp main_func_op = FindMainFuncOp(module_op); if (!main_func_op) return failure(); SmallVector input_names = GetEntryFunctionInputs(main_func_op); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc index ef63e75e52e8c9..ee7ae1a4c6d90a 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.cc @@ -21,6 +21,7 @@ limitations under the License. #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tsl/platform/errors.h" namespace tensorflow { namespace quantization { @@ -33,11 +34,8 @@ absl::Status RunPassesOnModuleOp( absl::StatusOr> dump_file; if (mlir_dump_file_name) { - dump_file = tensorflow::quantization::MaybeEnableIrPrinting( - pass_manager, mlir_dump_file_name.value()); - if (!dump_file.ok()) { - return dump_file.status(); - } + TF_RETURN_IF_ERROR(tensorflow::quantization::MaybeEnableIrPrinting( + pass_manager, mlir_dump_file_name.value())); } if (failed(pass_manager.run(module_op))) { diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h index 35f243a9f9626d..06db2acb7b057f 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/cc/run_passes.h @@ -23,6 +23,7 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" namespace tensorflow { @@ -47,8 +48,7 @@ absl::Status RunPasses(const absl::string_view name, FuncT add_passes_func, add_passes_func(pm); mlir::StatusScopedDiagnosticHandler diagnostic_handler{&ctx}; - TF_ASSIGN_OR_RETURN(const std::unique_ptr out_dump_file, - MaybeEnableIrPrinting(pm, name)); + TF_RETURN_IF_ERROR(MaybeEnableIrPrinting(pm, name)); if (failed(pm.run(module_op))) { return absl::InternalError( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD index fa55ce0ba1e391..b465fe15e8d57c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/BUILD @@ -19,16 +19,21 @@ cc_library( hdrs = ["mlir_dump.h"], compatible_with = get_compatible_with_portable(), deps = [ + "@com_google_absl//absl/log", + "@com_google_absl//absl/memory", "@com_google_absl//absl/status", "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", - "@com_google_absl//absl/strings:str_format", "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:env", + "@local_tsl//tsl/platform:errors", "@local_tsl//tsl/platform:path", "@local_tsl//tsl/platform:status", + "@local_tsl//tsl/platform:statusor", + "@local_tsl//tsl/platform:stringpiece", ], ) @@ -39,14 +44,19 @@ tf_cc_test( deps = [ ":mlir_dump", "@com_google_absl//absl/cleanup", - "@com_google_absl//absl/status:statusor", "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", "@llvm-project//mlir:Pass", 
"@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", + "@local_tsl//tsl/lib/core:status_test_util", + "@local_tsl//tsl/platform:env", "@local_tsl//tsl/platform:path", "@local_tsl//tsl/platform:test", "@local_tsl//tsl/platform:test_main", + "@stablehlo//:stablehlo_ops", ], ) @@ -57,7 +67,7 @@ tf_kernel_library( deps = [ "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/core:framework", - "//tensorflow/core:portable_gif_internal", + "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:path", "@com_google_absl//absl/log", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc index ce52e488774f60..312848e5125af3 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/dump_tensor_op.cc @@ -25,6 +25,8 @@ limitations under the License. #include "tensorflow/core/framework/tensor.h" #include "tensorflow/core/framework/tensor.pb.h" #include "tensorflow/core/framework/types.h" +#include "tensorflow/core/lib/io/compression.h" +#include "tensorflow/core/lib/io/record_writer.h" #include "tensorflow/core/platform/path.h" #include "tensorflow/core/platform/types.h" #include "tsl/platform/env.h" @@ -72,7 +74,19 @@ class DumpTensorOp : public OpKernel { OP_REQUIRES_OK(ctx, ctx->GetAttr("node_name", &node_name)); OP_REQUIRES_OK(ctx, ctx->env()->RecursivelyCreateDir(log_dir_path)); - tensor_data_path_ = io::JoinPath(log_dir_path, file_name); + std::string tensor_data_path = io::JoinPath(log_dir_path, file_name); + std::unique_ptr tensor_data_file; + OP_REQUIRES_OK( + ctx, ctx->env()->NewWritableFile(tensor_data_path, &tensor_data_file)); + + // Turn on Zlib compression. 
+ io::RecordWriterOptions options = + io::RecordWriterOptions::CreateRecordWriterOptions( + io::compression::kZlib); + tensor_data_writer_ = + std::make_unique(tensor_data_file.release(), options); + OP_REQUIRES(ctx, tensor_data_writer_ != nullptr, + absl::AbortedError("Could not create record writer")); // Fetch func_name and node_name from attributes and save as proto. quantization::UnitWiseQuantizationSpec::QuantizationUnit quant_unit_proto; @@ -80,28 +94,31 @@ class DumpTensorOp : public OpKernel { quant_unit_proto.set_node_name(node_name); string quant_unit_path = io::JoinPath(log_dir_path, "quant_unit.pb"); - OP_REQUIRES_OK( ctx, SaveSerializedProtoToFile(quant_unit_proto.SerializeAsString(), quant_unit_path, ctx->env())); } + ~DumpTensorOp() override { + (void)tensor_data_writer_->Flush(); + (void)tensor_data_writer_->Close(); + } + void Compute(OpKernelContext* ctx) override { - if (enabled_) { - const Tensor& tensor_data = ctx->input(0); + if (!enabled_) return; + + const Tensor& tensor_data = ctx->input(0); - TensorProto tensor_proto; - tensor_data.AsProtoTensorContent(&tensor_proto); + TensorProto tensor_proto; + tensor_data.AsProtoTensorContent(&tensor_proto); - OP_REQUIRES_OK(ctx, - SaveSerializedProtoToFile(tensor_proto.SerializeAsString(), - tensor_data_path_, ctx->env())); - } + OP_REQUIRES_OK(ctx, tensor_data_writer_->WriteRecord( + tensor_proto.SerializeAsString())); } private: - std::string tensor_data_path_; bool enabled_; + std::unique_ptr tensor_data_writer_; }; REGISTER_KERNEL_BUILDER(Name("DumpTensor").Device(DEVICE_CPU), DumpTensorOp); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc index 1d1a54e5a88db5..7a19c7b5579617 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.cc @@ -14,22 +14,34 @@ limitations under the License. 
==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h" +#include #include #include #include +#include +#include "absl/log/log.h" +#include "absl/memory/memory.h" #include "absl/status/status.h" #include "absl/status/statusor.h" -#include "absl/strings/str_format.h" +#include "absl/strings/match.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tsl/platform/env.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/file_system.h" #include "tsl/platform/path.h" #include "tsl/platform/status.h" +#include "tsl/platform/statusor.h" +#include "tsl/platform/stringpiece.h" namespace tensorflow { namespace quantization { @@ -63,39 +75,138 @@ absl::StatusOr GetMlirDumpDir() { return dump_dir; } +// A simple wrapper of tsl::WritableFile so that mlir Pass infra can use it. 
+class WritableFileWrapper : public llvm::raw_ostream { + public: + ~WritableFileWrapper() override = default; + static absl::StatusOr> Create( + const std::string& filepath) { + std::unique_ptr file; + TF_RETURN_IF_ERROR(tsl::Env::Default()->NewWritableFile(filepath, &file)); + return absl::WrapUnique(new WritableFileWrapper(std::move(file))); + } + + private: + explicit WritableFileWrapper(std::unique_ptr file) + : file_(std::move(file)) { + SetUnbuffered(); + } + + uint64_t current_pos() const override { + int64_t position; + if (file_->Tell(&position).ok()) { + return position; + } else { + return -1; + } + } + + void write_impl(const char* ptr, size_t size) override { + if (file_ && !file_->Append(tsl::StringPiece(ptr, size)).ok()) { + file_ = nullptr; + } + } + + std::unique_ptr file_; +}; + // Creates a new file to dump the intermediate MLIRs by prefixing the // `dump_file_name` with the value of the TF_QUANT_MLIR_DUMP_PREFIX env // variable. Returns absl::FailedPreconditionError if the env variable is not // set or set to an empty string. 
-absl::StatusOr> CreateMlirDumpFile( +absl::StatusOr> CreateMlirDumpFile( const absl::string_view dump_file_name) { const absl::StatusOr dump_dir = GetMlirDumpDir(); if (!dump_dir.ok()) { return dump_dir.status(); } - auto *env = tsl::Env::Default(); - const tsl::Status status = env->RecursivelyCreateDir(*dump_dir); - if (!status.ok()) { - return status; - } + auto* env = tsl::Env::Default(); + TF_RETURN_IF_ERROR(env->RecursivelyCreateDir(*dump_dir)); - std::error_code ec{}; // NOLINT: Required to create llvm::raw_fd_ostream const std::string dump_file_path = tsl::io::JoinPath(*dump_dir, dump_file_name); - auto dump_file = std::make_unique(dump_file_path, ec); - if (ec) { - return absl::InternalError(absl::StrFormat( - "Unable to open file: %s, error: %s", dump_file_path, ec.message())); - } + TF_ASSIGN_OR_RETURN(std::unique_ptr file, + WritableFileWrapper::Create(dump_file_path)); LOG(INFO) << "IR dump file created: " << dump_file_path; - return dump_file; + return file; } +class PrinterConfig : public mlir::PassManager::IRPrinterConfig { + public: + explicit PrinterConfig( + absl::string_view dump_file_prefix, bool print_module_scope = false, + bool print_after_only_on_change = true, + mlir::OpPrintingFlags op_printing_flags = mlir::OpPrintingFlags()) + : mlir::PassManager::IRPrinterConfig( + print_module_scope, print_after_only_on_change, + /*printAfterOnlyOnFailure=*/false, op_printing_flags), + mlir_pass_count_(1), + dump_file_prefix_(dump_file_prefix) {} + + void printBeforeIfEnabled(mlir::Pass* pass, mlir::Operation* op, + PrintCallbackFn print_callback) override { + Dump(pass, print_callback, /*is_before=*/true); + } + + void printAfterIfEnabled(mlir::Pass* pass, mlir::Operation* op, + PrintCallbackFn print_callback) override { + Dump(pass, print_callback, /*is_before=*/false); + } + + private: + int64_t mlir_pass_count_; + absl::string_view dump_file_prefix_; + // Map from pass ptr to dump files and pass number. 
+ // + // Each pass has unique and stable pointer, even for passes with the same + // name. E.g. a PassManager could have multiple Canonicalizer passes. + // We use this property to uniquely determine a Pass in a PassManager. + // + // If multiple consecutive func passes are applied to a Module. PassManager + // will iterate over the func in the outer loop and apply the passes in the + // inner loop. This may cause passes to run out-of-order. But the 1st runs of + // each pass are still in-order. So we use pass_to_number_map_ to keep track + // of the number for each pass. + llvm::DenseMap> + pass_to_dump_file_before_map_; + llvm::DenseMap> + pass_to_dump_file_after_map_; + llvm::DenseMap pass_to_number_map_; + + // Get the unique number for each pass. + int64_t GetPassNumber(mlir::Pass* pass) { + if (!pass_to_number_map_.contains(pass)) { + pass_to_number_map_[pass] = mlir_pass_count_++; + } + return pass_to_number_map_[pass]; + } + + void Dump(mlir::Pass* pass, PrintCallbackFn print_callback, bool is_before) { + auto& pass_to_dump_file_map = is_before ? pass_to_dump_file_before_map_ + : pass_to_dump_file_after_map_; + if (!pass_to_dump_file_map.contains(pass)) { + std::string filename = llvm::formatv( + "{0}_{1,0+4}_{2}_{3}.mlir", dump_file_prefix_, GetPassNumber(pass), + pass->getName().str(), is_before ? 
"before" : "after"); + absl::StatusOr> dump_file = + CreateMlirDumpFile(filename); + if (!dump_file.ok()) { + LOG(WARNING) << "Failed to dump MLIR module to " << filename; + return; + } + pass_to_dump_file_map[pass] = std::move(*dump_file); + } + + return print_callback(*(pass_to_dump_file_map[pass])); + } +}; + } // namespace -void EnableIrPrinting(llvm::raw_ostream &out_stream, mlir::PassManager &pm) { +void EnableIrPrinting(mlir::PassManager& pm, + absl::string_view file_name_prefix) { mlir::OpPrintingFlags flag{}; flag.useLocalScope().elideLargeElementsAttrs().enableDebugInfo(); @@ -112,39 +223,23 @@ void EnableIrPrinting(llvm::raw_ostream &out_stream, mlir::PassManager &pm) { // `PassManager::enableIRPrinting`, except for the `printModuleScope` // parameter, which is true by default. It is set to false to avoid the dump // file size becoming too large when the passes are running on a large model. - pm.enableIRPrinting( - /*shouldPrintBeforePass=*/[](mlir::Pass *, - mlir::Operation *) { return true; }, - /*shouldPrintAfterPass=*/ - [](mlir::Pass *, mlir::Operation *) { return true; }, - /*printModuleScope=*/false, /*printAfterOnlyOnChange=*/true, - /*printAfterOnlyOnFailure=*/false, out_stream, flag); - - LOG(INFO) << "IR dump for TensorFlow quantization pipeline enabled."; + pm.enableIRPrinting(std::make_unique( + file_name_prefix, /*print_module_scope=*/false, + /*print_after_only_on_change=*/true, flag)); } // TODO(b/259374854): Create tests for MaybeEnableIrPrinting. 
-absl::StatusOr> MaybeEnableIrPrinting( - mlir::PassManager &pm, const absl::string_view name) { +absl::Status MaybeEnableIrPrinting(mlir::PassManager& pm, + absl::string_view file_name_prefix) { if (!VLOG_IS_ON(1)) { LOG(INFO) << "Verbosity level too low to enable IR printing."; - return nullptr; + return absl::OkStatus(); } - absl::StatusOr> dump_file = - CreateMlirDumpFile(/*dump_file_name=*/absl::StrCat(name, ".mlir")); - if (absl::IsFailedPrecondition(dump_file.status())) { - // Requirements for enabling IR dump are not met. IR printing will not be - // enabled. - LOG(WARNING) << dump_file.status(); - return nullptr; - } else if (!dump_file.ok()) { - return dump_file.status(); - } - - EnableIrPrinting(**dump_file, pm); + EnableIrPrinting(pm, file_name_prefix); - return dump_file; + LOG(INFO) << "IR dump for TensorFlow quantization pipeline enabled."; + return absl::OkStatus(); } } // namespace quantization diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h index 803cd39a0a5bae..38a9c4fae4f912 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump.h @@ -15,27 +15,29 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_DEBUGGING_MLIR_DUMP_H_ #define TENSORFLOW_COMPILER_MLIR_QUANTIZATION_TENSORFLOW_DEBUGGING_MLIR_DUMP_H_ -#include - -#include "absl/status/statusor.h" +#include "absl/status/status.h" #include "absl/strings/string_view.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/Pass/PassManager.h" // from @llvm-project namespace tensorflow { namespace quantization { -// Enables IR printing for `pm`. When the passes are run, the IRs will be dumped -// to `out_stream`. -void EnableIrPrinting(llvm::raw_ostream &out_stream, mlir::PassManager &pm); +// Enables IR printing for `pm`. 
When the passes are run, each pass will dump to +// its own file with prefix `file_name_prefix`. +void EnableIrPrinting(mlir::PassManager &pm, + absl::string_view file_name_prefix); // If verbosity level >= 1, this will dump intermediate IRs of passes to a file. -// The file path is given by prefixing `name`.mlir with the value of the -// TF_QUANT_MLIR_DUMP_PREFIX env variable. Returns `nullptr` iff the verbosity -// level < 1 or TF_QUANT_MLIR_DUMP_PREFIX is not set or set to an empty string. -// The returned ostream instance should live until the pass run is complete. -absl::StatusOr> MaybeEnableIrPrinting( - mlir::PassManager &pm, absl::string_view name); +// The dumped mlir files with be under a directory determined by +// the TF_QUANT_MLIR_DUMP_PREFIX env variable. The PassManager will dump to a +// new file for each pass. The file name will have the format +// {file_name_prefix}_{pass_number}_{pass_name}_{before|after}.mlir. +// * `file_name_prefix` is from input. +// * `pass_number` increments from 1 for each pass. +// * `pass_name` is the name of the pass. +// * `before|after` indicates whether the dump occurs before or after the pass. +absl::Status MaybeEnableIrPrinting(mlir::PassManager &pm, + absl::string_view file_name_prefix); } // namespace quantization } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump_test.cc b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump_test.cc index a7162d9a05a4ff..c3034f4294b13d 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump_test.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/debugging/mlir_dump_test.cc @@ -16,23 +16,30 @@ limitations under the License. 
#include #include +#include #include "absl/cleanup/cleanup.h" -#include "absl/status/statusor.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinDialect.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project +#include "stablehlo/dialect/StablehloOps.h" // from @stablehlo +#include "tsl/lib/core/status_test_util.h" +#include "tsl/platform/env.h" #include "tsl/platform/path.h" #include "tsl/platform/test.h" namespace tensorflow { namespace quantization { -namespace { +namespace mlir_dump_test { class NoOpPass : public mlir::PassWrapper> { @@ -69,12 +76,7 @@ class ParentPass pm.addPass(CreateNoOpPass()); - std::error_code ec{}; // NOLINT: Required to create llvm::raw_fd_ostream - const std::string tmp_dump_filename = - tsl::io::GetTempFilename(/*extension=*/".mlir"); - llvm::raw_fd_ostream dump_file{tmp_dump_filename, ec}; - - EnableIrPrinting(dump_file, pm); + EnableIrPrinting(pm, "dump2"); if (failed(pm.run(module_op))) { signalPassFailure(); @@ -86,41 +88,88 @@ std::unique_ptr> CreateParentPass() { return std::make_unique(); } -TEST(EnableIrPrintingTest, PassSuccessfullyRuns) { - mlir::MLIRContext ctx{}; +} // namespace mlir_dump_test - mlir::PassManager pm = {&ctx}; - pm.addPass(CreateNoOpPass()); +namespace { - std::error_code ec{}; // NOLINT: Required to create llvm::raw_fd_ostream - const std::string tmp_dump_filename = - tsl::io::GetTempFilename(/*extension=*/".mlir"); - 
llvm::raw_fd_ostream dump_file{tmp_dump_filename, ec}; +using namespace tensorflow::quantization::mlir_dump_test; - EnableIrPrinting(dump_file, pm); +class EnableIrPrintingTest : public ::testing::Test { + protected: + EnableIrPrintingTest() : env_(tsl::Env::Default()) { + if (!tsl::io::GetTestUndeclaredOutputsDir(&test_dir_)) { + test_dir_ = tsl::testing::TmpDir(); + } + } - mlir::OpBuilder builder(&ctx); - auto module_op = builder.create(builder.getUnknownLoc()); - // Destroy by calling destroy() to avoid memory leak since it is allocated - // with malloc(). - const absl::Cleanup module_op_cleanup = [module_op] { module_op->destroy(); }; + void SetUp() override { + tsl::setenv("TF_QUANT_MLIR_DUMP_PREFIX", test_dir_.c_str(), 1); - const mlir::LogicalResult result = pm.run(module_op); + mlir::DialectRegistry dialects; + dialects.insert(); + ctx_ = std::make_unique(dialects); + ctx_->loadAllAvailableDialects(); + } + + void TearDown() override { + // Delete files in the test directory. + std::vector files; + TF_ASSERT_OK( + env_->GetMatchingPaths(tsl::io::JoinPath(test_dir_, "*"), &files)); + for (const std::string& file : files) { + TF_ASSERT_OK(env_->DeleteFile(file)); + } + } + + tsl::Env* env_; + std::string test_dir_; + std::unique_ptr ctx_; +}; + +TEST_F(EnableIrPrintingTest, PassSuccessfullyRuns) { + mlir::PassManager pm = {ctx_.get()}; + pm.addPass(CreateNoOpPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + pm.addNestedPass(mlir::createCanonicalizerPass()); + + EnableIrPrinting(pm, "dump"); + + constexpr absl::string_view program = R"mlir( +module{ + func.func @main(%arg0: tensor<10xf32>) -> tensor<10xf32> { + return %arg0 : tensor<10xf32> + } + func.func @func1(%arg0: tensor<10xf32>, %arg1: tensor<10xf32>) -> tensor<10xf32> { + %0 = stablehlo.add %arg0, %arg1 : tensor<10xf32> + %1 = stablehlo.add %arg0, %arg1 : tensor<10xf32> + return %0 : tensor<10xf32> + } +})mlir"; + auto module_op = mlir::parseSourceString(program, ctx_.get()); + + const 
mlir::LogicalResult result = pm.run(module_op.get()); EXPECT_FALSE(failed(result)); + + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, + "dump_0001_tensorflow::quantization::mlir_dump_test" + "::NoOpPass_before.mlir"))); + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, "dump_0002_Canonicalizer_before.mlir"))); + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, "dump_0002_Canonicalizer_after.mlir"))); + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, "dump_0003_Canonicalizer_before.mlir"))); } -TEST(EnableNestedIrPrintingTest, PassSuccessfullyRuns) { +TEST_F(EnableIrPrintingTest, NestedPassSuccessfullyRuns) { mlir::MLIRContext ctx{}; mlir::PassManager pm = {&ctx}; pm.addPass(CreateParentPass()); - std::error_code ec{}; // NOLINT: Required to create llvm::raw_fd_ostream - const std::string tmp_dump_filename = - tsl::io::GetTempFilename(/*extension=*/".mlir"); - llvm::raw_fd_ostream dump_file{tmp_dump_filename, ec}; - - EnableIrPrinting(dump_file, pm); + EnableIrPrinting(pm, "dump"); mlir::OpBuilder builder(&ctx); auto module_op = builder.create(builder.getUnknownLoc()); @@ -130,6 +179,15 @@ TEST(EnableNestedIrPrintingTest, PassSuccessfullyRuns) { const mlir::LogicalResult result = pm.run(module_op); EXPECT_FALSE(failed(result)); + + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, + "dump_0001_tensorflow::quantization::mlir_dump_test" + "::ParentPass_before.mlir"))); + TF_EXPECT_OK(tsl::Env::Default()->FileExists( + tsl::io::JoinPath(test_dir_, + "dump2_0001_tensorflow::quantization::mlir_dump_test" + "::NoOpPass_before.mlir"))); } } // namespace } // namespace quantization diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD index fa201ff6a716bc..7042b6c5b17cdb 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/ops/BUILD @@ -20,8 +20,8 @@ cc_library( "//tensorflow/compiler/mlir/lite/quantization:quantization_lib", "//tensorflow/compiler/mlir/quantization/tensorflow:quantization_options_proto_cc", "//tensorflow/compiler/mlir/tensorflow", - "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "@com_google_absl//absl/container:flat_hash_set", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Support", ], diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc index 3a81ff0dfd2c91..fb13da8489a81c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.cc @@ -14,17 +14,17 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/quantization/tensorflow/ops/tf_op_quant_spec.h" -#include -#include #include #include #include #include "absl/container/flat_hash_set.h" +#include "llvm/Support/Casting.h" #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/quantization_options.pb.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc index 595534433849ee..a6ad3e34835672 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc +++ 
b/tensorflow/compiler/mlir/quantization/tensorflow/passes/add_dump_tensor_op.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/Transforms/GreedyPatternRewriteDriver.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/quantization/ir/QuantOps.h" #include "tensorflow/compiler/mlir/lite/quantization/quantization_utils.h" +#include "tensorflow/compiler/mlir/quantization/common/attrs_and_constraints.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/cc/quantization_unit_loc.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/tf_quant_ops.h" @@ -54,7 +55,76 @@ namespace { using DebuggerType = tensorflow::quantization::DebuggerOptions::DebuggerType; using DebuggerOptions = tensorflow::quantization::DebuggerOptions; +constexpr StringRef kEntryFuncAttrName = "_entry_function"; +constexpr StringRef kOriginalEntryFuncAttrName = "_original_entry_function"; constexpr StringRef kCompositeFuncPrefix = "composite_"; +constexpr StringRef kEmptyNodeName = "_empty_node"; + +template +std::pair GetFuncNameAndNodeName( + LiftedOp op, const FlatSymbolRefAttr &f_attr) { + static_assert(false, + "GetFuncNameAndNodeName for call_op is not implemented."); +} + +// Returns a pair: `func_name` and `node_name` for the lifted function. In TF +// quantizer, both are filled. For StableHLO quantizer, the func_name is only +// filled and node_name is always set to "_empty_node". 
+template <> +std::pair +GetFuncNameAndNodeName(TF::PartitionedCallOp call_op, + const FlatSymbolRefAttr &f_attr) { + std::optional quant_unit = + FindQuantizationUnitFromLoc(call_op->getLoc()); + return std::make_pair(quant_unit->func_name(), quant_unit->node_name()); +} + +template <> +std::pair GetFuncNameAndNodeName( + TF::XlaCallModuleOp call_op, const FlatSymbolRefAttr &f_attr) { + return std::make_pair(f_attr.getValue().str(), kEmptyNodeName.str()); +} + +template +Operation *DuplicateOp(LiftedOp op, PatternRewriter &rewriter, + const StringAttr &new_ref_func_name) { + static_assert(false, "DuplicateOp for call_op is not implemented."); +} + +template <> +Operation *DuplicateOp( + TF::PartitionedCallOp call_op, PatternRewriter &rewriter, + const StringAttr &new_ref_func_name) { + // Create PartitionedCallOp to the copied composite function. This + // PartitionedCallOp does not have kQuantTraitAttrName, and therefore won't + // get quantized. + auto new_call_op = rewriter.create( + call_op.getLoc(), call_op.getResultTypes(), call_op.getOperands(), + FlatSymbolRefAttr::get(new_ref_func_name)); + return new_call_op; +} + +template <> +Operation *DuplicateOp( + TF::XlaCallModuleOp call_op, PatternRewriter &rewriter, + const StringAttr &new_ref_func_name) { + // Create XlaCallModuleOp to the copied composite function. This + // XlaCallModuleOp does not have kQuantTraitAttrName, and therefore won't get + // quantized. 
+ auto new_call_op = rewriter.create( + call_op.getLoc(), call_op.getResultTypes(), call_op.getOperands(), + call_op.getVersionAttr(), call_op.getModuleAttr(), call_op.getSoutAttr()); + new_call_op->setAttr(kEntryFuncAttrName, + rewriter.getStringAttr(new_ref_func_name.getValue())); + new_call_op->setAttrs(call_op->getAttrs()); + new_call_op->removeAttr(rewriter.getStringAttr(kQuantTraitAttrName)); + + FlatSymbolRefAttr new_func_name_attr = + FlatSymbolRefAttr::get(rewriter.getContext(), new_ref_func_name); + new_call_op->setAttr(kEntryFuncAttrName, new_func_name_attr); + new_call_op->setAttr(kOriginalEntryFuncAttrName, new_ref_func_name); + return new_call_op; +} // AddDumpTensorOp pass adds DumpTensorOp - which saves entire value of its // input into a file - to quantizable layer's output. @@ -110,49 +180,66 @@ class AddDumpTensorOpPass std::string log_dir_path_ = "/tmp/dumps"; }; -class AddDumpTensorOp : public OpRewritePattern { +template +class AddDumpTensorOp : public OpRewritePattern { public: // Does not take ownership of context, which must refer to a valid value that // outlives this object. explicit AddDumpTensorOp(MLIRContext *context, DebuggerType debugger_type, std::string log_dir_path) - : OpRewritePattern(context), + : OpRewritePattern(context), debugger_type_(debugger_type), log_dir_path_(std::move(log_dir_path)) {} private: - DebuggerType debugger_type_; - std::string log_dir_path_; + SmallVector CreateDumpAttributes( + PatternRewriter &rewriter, const StringRef folder_name, + const StringRef file_name, const bool enabled, const StringRef func_name, + const StringRef node_name) const { + SmallVector dump_attributes{ + rewriter.getNamedAttr("log_dir_path", + rewriter.getStringAttr(folder_name)), + rewriter.getNamedAttr("file_name", rewriter.getStringAttr(file_name)), + // The op is disabled by default. Otherwise, values will be saved + // during calibration. 
+ rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)), + rewriter.getNamedAttr("func_name", rewriter.getStringAttr(func_name)), + rewriter.getNamedAttr("node_name", rewriter.getStringAttr(node_name)), + }; + return dump_attributes; + } - LogicalResult matchAndRewrite(TF::PartitionedCallOp call_op, - PatternRewriter &rewriter) const override { - const auto f_attr = call_op.getFAttr().dyn_cast(); - if (!call_op->hasAttr(kQuantTraitAttrName)) { - return failure(); - } - if (!f_attr.getValue().starts_with(kCompositeFuncPrefix)) { - return failure(); - } + StringAttr DuplicateFunction(Operation *op, + const FlatSymbolRefAttr &f_attr) const { + ModuleOp module = op->getParentOfType(); + SymbolTable symbol_table(module); - // For now, only support ops with 1 results - if (call_op->getNumResults() != 1) return failure(); + const func::FuncOp ref_func = + dyn_cast_or_null(symbol_table.lookup(f_attr.getValue())); + func::FuncOp new_ref_func = dyn_cast(ref_func->clone()); + return symbol_table.insert(new_ref_func); + } - Value result = call_op->getResult(0); + LogicalResult match(LiftedOpT op) const override { + if (!op->hasAttr(kQuantTraitAttrName) || op->getNumResults() != 1) { + return failure(); + } - // If one of the user is DumpTensorOp, do nothing + Value result = op->getResult(0); for (auto user : result.getUsers()) { if (dyn_cast_or_null(user)) return failure(); } - rewriter.setInsertionPointAfterValue(result); - - std::optional quant_unit = - FindQuantizationUnitFromLoc(call_op->getLoc()); + const FlatSymbolRefAttr f_attr = GetFuncAttr(op); + if (!f_attr.getValue().starts_with(kCompositeFuncPrefix)) return failure(); + return success(); + } - if (!quant_unit.has_value()) return failure(); + void rewrite(LiftedOpT op, PatternRewriter &rewriter) const override { + // Only support ops with 1 results + Value result = op->getResult(0); + rewriter.setInsertionPointAfterValue(result); - auto folder_name = - tensorflow::io::JoinPath(log_dir_path_, 
f_attr.getValue()); // In Whole model, we first need to set file_name as // unquantized_tensor_data.pb as it is used by unquantized dump model. // After saving unquantized dump model, the file name will be changed to @@ -161,77 +248,56 @@ class AddDumpTensorOp : public OpRewritePattern { // as quantized_tensor_data.pb here. // TODO: b/296933893 - Refactor the debugger code when no quantize option // is added - auto file_name = + std::string file_name = debugger_type_ == DebuggerOptions::DEBUGGER_TYPE_WHOLE_MODEL ? "unquantized_tensor_data.pb" : "quantized_tensor_data.pb"; - SmallVector dump_attributes{ - rewriter.getNamedAttr("log_dir_path", - rewriter.getStringAttr(folder_name)), - rewriter.getNamedAttr("file_name", rewriter.getStringAttr(file_name)), - // The op is disabled by default. Otherwise, values will be saved - // during calibration. - rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)), - rewriter.getNamedAttr("func_name", - rewriter.getStringAttr(quant_unit->func_name())), - rewriter.getNamedAttr("node_name", - rewriter.getStringAttr(quant_unit->node_name())), - }; + const FlatSymbolRefAttr f_attr = GetFuncAttr(op); + + // In TF::PartitionedCallOp case, func_name and node_name are filled. + // But in TF::XlaCallModuleOp case, node_name is `kEmptyNodeName` since + // debugging and selective quantization of StableHLO Quantizer only uses + // func_name for op matching. + auto [func_name, node_name] = GetFuncNameAndNodeName(op, f_attr); + std::string folder_name = + tensorflow::io::JoinPath(log_dir_path_, f_attr.getValue()); - rewriter.create(call_op->getLoc(), TypeRange{}, result, + // Attach DumpTensorOp to its output layer. + SmallVector dump_attributes = + CreateDumpAttributes(rewriter, folder_name, file_name, + /*enabled=*/false, func_name, node_name); + rewriter.create(op->getLoc(), TypeRange{}, result, dump_attributes); // Per-layer mode. 
if (debugger_type_ == DebuggerOptions::DEBUGGER_TYPE_INT_PER_LAYER || debugger_type_ == DebuggerOptions::DEBUGGER_TYPE_FLOAT_PER_LAYER) { - auto module = call_op->getParentOfType(); - SymbolTable symbol_table(module); - - // Copy composite function of quantizable layer. - const mlir::func::FuncOp ref_func = dyn_cast_or_null( - symbol_table.lookup(f_attr.getValue())); - mlir::func::FuncOp new_ref_func = - dyn_cast(ref_func->clone()); - const StringAttr new_ref_func_name = symbol_table.insert(new_ref_func); - - // Create PartitionedCallOp to the copied composite function. - // This PartitionedCallOp does not have kQuantTraitAttrName, and therefore - // won't get quantized. - auto ref_call_op = rewriter.create( - call_op.getLoc(), call_op.getResultTypes(), call_op.getOperands(), - FlatSymbolRefAttr::get(new_ref_func_name)); - - // Attach DumpTensorOp to its output unquantized layer. - SmallVector dump_attributes{ - rewriter.getNamedAttr("log_dir_path", - rewriter.getStringAttr(folder_name)), - rewriter.getNamedAttr("file_name", rewriter.getStringAttr( - "unquantized_tensor_data.pb")), - rewriter.getNamedAttr("enabled", rewriter.getBoolAttr(false)), - rewriter.getNamedAttr( - "func_name", rewriter.getStringAttr(quant_unit->func_name())), - rewriter.getNamedAttr( - "node_name", rewriter.getStringAttr(quant_unit->node_name())), - }; - - rewriter.create(call_op->getLoc(), TypeRange{}, - ref_call_op.getResult(0), - dump_attributes); + // Duplicate composite function and op of quantizable layer for creating + // unquantized layer. + StringAttr new_ref_func_name = DuplicateFunction(op, f_attr); + Operation *new_op = DuplicateOp(op, rewriter, new_ref_func_name); + + // Attach second DumpTensorOp to its output unquantized layer. 
+ SmallVector dump_attributes = CreateDumpAttributes( + rewriter, folder_name, /*file_name=*/"unquantized_tensor_data.pb", + /*enabled=*/false, func_name, node_name); + rewriter.create(op.getLoc(), TypeRange{}, + new_op->getResult(0), dump_attributes); if (debugger_type_ == DebuggerOptions::DEBUGGER_TYPE_FLOAT_PER_LAYER) { // Swap all uses between call_op and ref_call_op, except for the // particular use that owns DumpTensor. rewriter.replaceUsesWithIf( - call_op.getResult(0), ref_call_op.getResult(0), - [](OpOperand &use) -> bool { + op.getResult(0), new_op->getResult(0), [](OpOperand &use) -> bool { return !isa(use.getOwner()); }); } } - - return success(); } + + DebuggerType debugger_type_; + std::string log_dir_path_; }; static PassRegistration pass; @@ -241,7 +307,10 @@ void AddDumpTensorOpPass::runOnOperation() { RewritePatternSet patterns(ctx); ModuleOp module = getOperation(); - patterns.add(ctx, debugger_type_, log_dir_path_); + patterns.add, + AddDumpTensorOp>(ctx, debugger_type_, + log_dir_path_); + if (failed(applyPatternsAndFoldGreedily(module, std::move(patterns)))) { module.emitError() << "quant-add-dump-tensor-op failed."; signalPassFailure(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc index 6e94beb6b0a057..f1f65a1a183371 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/merge_initializer_function_ops_to_main.cc @@ -35,6 +35,7 @@ limitations under the License. 
#include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/quantization/common/func.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/manipulate_model_attr.h" #include "tensorflow/compiler/mlir/quantization/tensorflow/passes/passes.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -107,20 +108,6 @@ class MergeInitializerFunctionOpsToMainPass } }; -// Gets the "main" function from the module. Returns an empty op iff it doesn't -// exist. -func::FuncOp GetMainFunction(ModuleOp module_op) { - const auto main_func_id = - StringAttr::get(module_op.getContext(), kImportModelDefaultGraphFuncName); - auto func_ops = module_op.getOps(); - auto main_func_itr = absl::c_find_if(func_ops, [&main_func_id](auto func_op) { - return func_op.getName() == main_func_id; - }); - - if (main_func_itr == func_ops.end()) return {}; - return *main_func_itr; -} - // Returns true iff func_op has either no Region or the body has no Blocks. 
bool IsFuncOpEmpty(func::FuncOp func_op) { return func_op->getNumRegions() == 0 || func_op.getBody().empty(); @@ -336,7 +323,7 @@ void MergeInitializerFunctionOpsToMainPass::runOnOperation() { ModuleOp module_op = getOperation(); MLIRContext* ctx = module_op.getContext(); - func::FuncOp main_func_op = GetMainFunction(module_op); + func::FuncOp main_func_op = FindMainFuncOp(module_op); if (!main_func_op) { module_op.emitError("Main function op not found."); return signalPassFailure(); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc index 886a27011b1825..ebdd374288a065 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.cc @@ -24,6 +24,7 @@ limitations under the License. #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/Quant/QuantTypes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project #include "mlir/IR/DialectRegistry.h" // from @llvm-project @@ -120,6 +121,27 @@ bool HasEqualElementSize(Value val1, Value val2, ArrayRef val1_indices, return val1_result == val2_result; } +// Checks if a shape has dim sizes of all ones except the right most dim. 
+bool ReshapableTo1DTensor(ShapedType rhs_shape) { + for (auto rank = 0; rank < rhs_shape.getRank() - 1; rank++) { + if (rhs_shape.getDimSize(rank) != 1) { + return false; + } + } + return true; +} + +Value ReshapeTo1DTensor(OpBuilder& builder, Location loc, Value value) { + auto shape = value.getType().cast(); + if (shape.getRank() != 1) { + SmallVector new_shape; + new_shape.push_back(shape.getNumElements()); + value = builder.create( + loc, value, Create1DConstValue(builder, loc, new_shape)); + } + return ConstantFoldOpIfPossible(value.getDefiningOp()).front(); +} + // Matches convolution op with "NHWC" data format or matmul op with false adj_y. // The list of supported ops in this function is: // - Conv2DOp diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td index 30e298dd6e7048..d75a01be7d2182 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_lifting.td @@ -82,6 +82,13 @@ class HasEqualElementSize shape_1, list shape_2> : Constraint< "llvm::ArrayRef({" # !interleave(shape_2, ", ") # "}))">, "Checks if the given dimensions contain the same number of elements.">; +def ReshapableTo1DTensor : Constraint< + CPred<"quant::ReshapableTo1DTensor($0.getType().cast())">, + "Checks if the value dims are all ones except the right most dim">; + +def ReshapeTo1DTensor : NativeCodeCall< + "quant::ReshapeTo1DTensor($_builder, $_loc, $0)">; + def HasEqualShape : Constraint().hasRank() && " "$1.getType().cast().hasRank() && " @@ -112,7 +119,29 @@ def ConvertAddToBiasAdd : Pat< (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)), (TF_BiasAddOp $conv_out, $add_rhs, (CreateStringAttr<"NHWC">)), [(HasRankOf<1> $add_rhs_value), - (HasEqualElementSize<[-1], [0]> $conv_out, $add_rhs)]>; + (HasEqualElementSize<[-1], [0]> $conv_out, $add_rhs)], [], (addBenefit 
-1)>; + +// Convert conv+sub+mul pattern to conv+mul+add. +// (conv - sub) * mul -> conv * mul + (-sub) * mul +// +// This is needed to support Conv+BatchNorm pattern from Jax models converted +// using jax2tf w/o native serialization. Note that Jax2tf patterns always +// extend bias shapes to a rank of 4, e.g. 1x1x1x5. +def ConvertSubMulToMulAdd : Pat< + (TF_MulOp + (TF_SubOp + (SupportedAffineOpMatcher $conv_out, $input, $weight), + (TF_ConstOp:$sub_rhs IsFloatElementsAttr:$sub_rhs_value)), + (TF_ConstOp:$mul_rhs IsFloatElementsAttr:$mul_rhs_value)), + (TF_AddV2Op + (TF_MulOp $conv_out, (ReshapeTo1DTensor $mul_rhs)), + (TF_MulOp + (TF_NegOp (ReshapeTo1DTensor $sub_rhs)), + (ReshapeTo1DTensor $mul_rhs))), + [(ReshapableTo1DTensor $mul_rhs), + (ReshapableTo1DTensor $sub_rhs), + (HasEqualElementSize<[-1], [-1]> $conv_out, $mul_rhs), + (HasEqualElementSize<[-1], [-1]> $conv_out, $sub_rhs)]>; // TODO(b/278493977): Create generic implementation of lifting any fused op // with any reshaping op @@ -128,6 +157,7 @@ def ConvertAddWithReshapeToBiasAddWithReshape : Pat< (HasEqualElementSize<[-1], [0]> $reshape_out, $add_rhs)]>; // Fuse consecutive BiasAddOp and an AddV2Op. +// We also handle the case where add_rhs has rank 4. def FuseBiasAndAddV2 : Pat< (TF_AddV2Op (TF_BiasAddOp:$bias_add @@ -135,9 +165,10 @@ def FuseBiasAndAddV2 : Pat< (TF_ConstOp:$bias IsFloatElementsAttr:$bias_value), $data_format), (TF_ConstOp:$add_rhs IsFloatElementsAttr:$add_rhs_value)), (TF_BiasAddOp - $conv_out, (TF_AddV2Op $bias, $add_rhs), $data_format), + $conv_out, (TF_AddV2Op $bias, (ReshapeTo1DTensor $add_rhs)), $data_format), [(HasOneUse $bias_add), - (HasEqualShape $bias_value, $add_rhs_value)]>; + (ReshapableTo1DTensor $add_rhs), + (HasEqualElementSize<[-1], [-1]> $bias, $add_rhs)]>; // Fuse AffineOp followed by an MulOp patterns. 
def FuseAffineOpAndMul : Pat< diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc index b5fb96396f7ef9..20ffa2adcfa969 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/passes/prepare_quantize.cc @@ -414,7 +414,7 @@ void PrepareQuantizePass::runOnOperation() { ApplyQuantizationParamsPropagation( func, is_signed, /*bit_width=*/8, !enable_per_channel_quantization_, GetTFOpQuantSpec, GetTfQuantScaleSpec, infer_tensor_range, - quant_specs_.legacy_float_scale); + quant_specs_.legacy_float_scale, /*is_qdq_conversion=*/false); RewritePatternSet patterns2(ctx); patterns2.add(ctx); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD index 5248d95c9f9e10..490cc1fc889b91 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/BUILD @@ -11,6 +11,7 @@ package( default_visibility = [ "//tensorflow/compiler/mlir/quantization/stablehlo:__subpackages__", "//tensorflow/compiler/mlir/quantization/tensorflow:internal_visibility_allowlist_package", + "//tensorflow/lite:__subpackages__", "//tensorflow/python:__subpackages__", "//tensorflow/tools/pip_package/v2:__subpackages__", ], @@ -345,6 +346,7 @@ tf_py_strict_test( "//tensorflow/python/framework:tensor_spec", "//tensorflow/python/framework:test_lib", "//tensorflow/python/lib/io:file_io", + "//tensorflow/python/lib/io:tf_record", "//tensorflow/python/module", "//tensorflow/python/ops:array_ops", "//tensorflow/python/ops:math_ops", diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py index 
a67a6451341e4e..ca7027dbe312bd 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/integration_test/quantize_model_test.py @@ -42,6 +42,7 @@ from tensorflow.python.framework import tensor_spec from tensorflow.python.framework import test_util from tensorflow.python.lib.io import file_io +from tensorflow.python.lib.io import tf_record from tensorflow.python.module import module from tensorflow.python.ops import array_ops from tensorflow.python.ops import math_ops @@ -1790,6 +1791,15 @@ def gen_data() -> repr_dataset.RepresentativeDataset: 'enable_per_channel_quantization': True, 'dilations': [1, 2, 2, 1], }, + { + 'testcase_name': 'with_bias_and_relu6_to_stablehlo_per_channel', + 'activation_fn': nn_ops.relu6, + 'has_bias': True, + 'has_batch_norm': False, + 'target_opset': quant_opts_pb2.STABLEHLO, + 'input_shape_dynamic': False, + 'enable_per_channel_quantization': True, + }, ) @test_util.run_in_graph_and_eager_modes def test_conv_ptq_model( @@ -1950,6 +1960,10 @@ def data_gen() -> repr_dataset.RepresentativeDataset: ), ) self.assertFalse(self._contains_op(output_graphdef, 'Conv2D')) + elif target_opset == quant_opts_pb2.STABLEHLO: + # This is to verify the invocation of StableHLO quantizer works. More + # thorough functional tests are in StableHLO quantizer directory. 
+ self.assertTrue(self._contains_op(output_graphdef, 'XlaCallModule')) else: self.assertTrue(self._contains_quantized_function_call(output_graphdef)) self.assertFalse(self._contains_op(output_graphdef, 'FusedBatchNormV3')) @@ -5831,7 +5845,7 @@ def test_while_op_model( class DebuggerTest(quantize_model_test_base.QuantizedModelTest): - def _run_model_in_sess(self, model_dir, tags, signature_key, sample_input): + def _run_model_in_sess(self, model_dir, tags, signature_key, sample_inputs): with tensorflow.compat.v1.Session(graph=tensorflow.Graph()) as sess: meta_graph = saved_model_loader.load(sess, tags, export_dir=model_dir) signature_def = meta_graph.signature_def[signature_key] @@ -5843,13 +5857,26 @@ def _run_model_in_sess(self, model_dir, tags, signature_key, sample_input): for output_tensor_info in signature_def.outputs.values() ] - feed_dict = {} - for input_key, input_value in sample_input.items(): - input_tensor_name = signature_def.inputs[input_key].name - feed_dict[input_tensor_name] = input_value + output_values = [] + for sample_input in sample_inputs: + feed_dict = {} + for input_key, input_value in sample_input.items(): + input_tensor_name = signature_def.inputs[input_key].name + feed_dict[input_tensor_name] = input_value - # Obtain the output of the model. - return sess.run(output_tensor_names, feed_dict=feed_dict)[0] + # Obtain the output of the model. 
+ output_values.append( + sess.run(output_tensor_names, feed_dict=feed_dict)[0] + ) + return output_values + + def _read_tensor_array_file(self, file_path): + tensor_protos = [] + for raw_record in tf_record.tf_record_iterator(file_path, options='ZLIB'): + tensor_protos.append( + tensorflow.make_ndarray(tensor_pb2.TensorProto.FromString(raw_record)) + ) + return np.array(tensor_protos) @parameterized.named_parameters( { @@ -5926,9 +5953,10 @@ def data_gen() -> repr_dataset.RepresentativeDataset: converted_model.signatures._signatures.keys(), {'serving_default'} ) - sample_input = { - 'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3)) - } + sample_inputs = [ + {'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3))}, + {'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3))}, + ] # Check if output of the model and value saved by DumpTensorOp matches. # Verify for both unquantized model and quantized model. @@ -5936,24 +5964,19 @@ def data_gen() -> repr_dataset.RepresentativeDataset: [unquantized_dump_model_path, 'unquantized_tensor_data.pb'], [self._output_saved_model_path, 'quantized_tensor_data.pb'], ]: - output_value = self._run_model_in_sess( - model_path, tags, 'serving_default', sample_input + output_values = self._run_model_in_sess( + model_path, tags, 'serving_default', sample_inputs ) # Find the dump file and parse it. folder = os.path.join(log_dir_path, os.listdir(log_dir_path)[0]) dump_file_path = os.path.join(log_dir_path, folder, file_name) - - dump_file_proto = tensor_pb2.TensorProto.FromString( - open(dump_file_path, 'rb').read() - ) - - dump_file_numpy = tensorflow.make_ndarray(dump_file_proto) + dump_file_numpy = self._read_tensor_array_file(dump_file_path) # Since the model only has one conv2d and its output is directly used as # the output of the model, output of the model and conv2d's dump value # should be the same. 
- self.assertAllEqual(output_value, dump_file_numpy) + self.assertAllEqual(output_values, dump_file_numpy) # Verify if quant_unit.pb file was created correctly. quant_unit_file_path = os.path.join(log_dir_path, folder, 'quant_unit.pb') @@ -6070,15 +6093,16 @@ def data_gen() -> repr_dataset.RepresentativeDataset: converted_model.signatures._signatures.keys(), {'serving_default'} ) - sample_input = { - 'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3)) - } + sample_inputs = [ + {'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3))}, + {'input_tensor': np.random.uniform(low=0, high=1, size=(16, 3, 4, 3))}, + ] output_value_from_original_model = self._run_model_in_sess( - self._input_saved_model_path, tags, 'serving_default', sample_input + self._input_saved_model_path, tags, 'serving_default', sample_inputs ) output_value_from_debugging_model = self._run_model_in_sess( - self._output_saved_model_path, tags, 'serving_default', sample_input + self._output_saved_model_path, tags, 'serving_default', sample_inputs ) # Find the both quantized and unquantized dump file. 
@@ -6090,18 +6114,11 @@ def data_gen() -> repr_dataset.RepresentativeDataset: log_dir_path, folder, 'quantized_tensor_data.pb' ) - unquantized_dump_file_proto = tensor_pb2.TensorProto.FromString( - open(unquantized_dump_file_path, 'rb').read() - ) - quantized_dump_file_proto = tensor_pb2.TensorProto.FromString( - open(quantized_dump_file_path, 'rb').read() - ) - - unquantized_dump_file_numpy = tensorflow.make_ndarray( - unquantized_dump_file_proto + unquantized_dump_file_numpy = self._read_tensor_array_file( + unquantized_dump_file_path ) - quantized_dump_file_numpy = tensorflow.make_ndarray( - quantized_dump_file_proto + quantized_dump_file_numpy = self._read_tensor_array_file( + quantized_dump_file_path ) # Since the model only has one conv2d and its output is directly used as @@ -6143,169 +6160,46 @@ class CalibrationOptionsTest(quantize_model_test_base.QuantizedModelTest): (default in TF2) to ensure support for when TF2 is disabled. """ - @parameterized.named_parameters( - { - 'testcase_name': 'with_min_max', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX - ), - }, - { - 'testcase_name': 'with_min_max_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX - ), - }, - { - 'testcase_name': 'with_min_max_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX - ), - }, - { - 'testcase_name': 'with_average_min_max', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX - ), - }, - { - 'testcase_name': 'with_average_min_max_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 
'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX - ), - }, - { - 'testcase_name': 'with_average_min_max_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX - ), - }, - { - 'testcase_name': 'with_histogram_percentile', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_percentile_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_percentile_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_bruteforce', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_bruteforce_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - 
calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_bruteforce_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, - ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_max_frequency', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + @parameterized.parameters( + parameter_combinations([{ + 'target_opset': [ + quant_opts_pb2.TF, + quant_opts_pb2.XLA, + quant_opts_pb2.UNIFORM_QUANTIZED, + ], + 'calibration_options': [ + quant_opts_pb2.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_MIN_MAX ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_max_frequency_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + quant_opts_pb2.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_AVERAGE_MIN_MAX ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_max_frequency_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, - 
calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + quant_opts_pb2.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_PERCENTILE, + calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( + initial_num_bins=10, + ), ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_symmetric', - 'target_opset': quant_opts_pb2.TF, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + quant_opts_pb2.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_BRUTEFORCE, + calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( + initial_num_bins=10, + ), ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_symmetric_to_xla', - 'target_opset': quant_opts_pb2.XLA, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + quant_opts_pb2.CalibrationOptions( + calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_MAX_FREQUENCY, + calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( + initial_num_bins=10, + ), ), - ), - }, - { - 'testcase_name': 'with_histogram_mse_symmetric_to_uq', - 'target_opset': quant_opts_pb2.UNIFORM_QUANTIZED, - 'calibration_options': quant_opts_pb2.CalibrationOptions( - calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, - calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( - initial_num_bins=10, + quant_opts_pb2.CalibrationOptions( + 
calibration_method=_CalibrationMethod.CALIBRATION_METHOD_HISTOGRAM_MSE_SYMMETRIC, + calibration_parameters=quant_opts_pb2.CalibrationOptions.CalibrationParameters( + initial_num_bins=10, + ), ), - ), - }, + ], + }]) ) @test_util.run_in_graph_and_eager_modes def test_conv_ptq_model_by_calibration_options( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py index d9f8c9781fc4ca..902ee3e5c94e2e 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/py_function_lib.py @@ -22,7 +22,6 @@ from tensorflow.compiler.mlir.quantization.tensorflow import quantization_options_pb2 from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_algorithm from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import calibration_statistics_pb2 -from tensorflow.compiler.mlir.quantization.tensorflow.calibrator import pywrap_calibration from tensorflow.compiler.mlir.quantization.tensorflow.python import pywrap_function_lib from tensorflow.compiler.mlir.quantization.tensorflow.python import representative_dataset as rd from tensorflow.compiler.mlir.quantization.tensorflow.python import save_model @@ -500,31 +499,6 @@ def _run_graph_for_calibration( logging.info('Calibration step complete.') -def _get_min_max_from_calibrator( - node_id: bytes, - calib_opts: quantization_options_pb2.CalibrationOptions, -) -> tuple[float, float]: - """Calculate min and max from statistics using calibration options. - - Args: - node_id: bytes of node id. - calib_opts: Calibration options used for calculating min and max. - - Returns: - (min_value, max_value): Min and max calculated using calib_opts. - - Raises: - ValueError: Unsupported calibration method is given. 
- """ - statistics: calibration_statistics_pb2.CalibrationStatistics = ( - pywrap_calibration.get_statistics_from_calibrator(node_id) - ) - min_value, max_value = calibration_algorithm.get_min_max_value( - statistics, calib_opts - ) - return min_value, max_value - - class PyFunctionLibrary(pywrap_function_lib.PyFunctionLibrary): """Wrapper class for overridden python method definitions. diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc index 84959d9ba30405..512102d0a5f53c 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.cc @@ -93,7 +93,8 @@ absl::StatusOr> RunExportPasses( } if (absl::Status pass_run_status = RunPasses( - /*name=*/export_opts.debug_name, + /*name=*/ + export_opts.debug_name, /*add_passes_func=*/ [dup_constants = export_opts.duplicate_shape_determining_constants]( mlir::PassManager &pm) { AddExportPasses(pm, dup_constants); }, @@ -163,7 +164,8 @@ absl::StatusOr QuantizeQatModel( /*deserialize_xla_call_module=*/false)); TF_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantQatStepName, /*add_passes_func=*/ + /*name=*/ + kTfQuantQatStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizeQatPasses(pm, quantization_options, kTfQuantQatStepName); }, @@ -243,7 +245,8 @@ absl::StatusOr QuantizePtqModelPreCalibration( *module_ref, QuantizationConfig())); } else { TF_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPreCalibrationStepName, /*add_passes_func=*/ + /*name=*/ + kTfQuantPtqPreCalibrationStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqPreCalibrationPasses(pm, quantization_options); }, @@ -325,12 +328,22 @@ absl::StatusOr QuantizePtqModelPostCalibration( // Use StableHLO Quantizer option if opset is specified. 
if (is_stablehlo) { + QuantizationConfig quantization_config{}; + quantization_config.mutable_static_range_ptq_preset() + ->set_enable_per_channel_quantized_weight( + quantization_options.enable_per_channel_quantization()); + // When targeting server TPUs quantized types should be unpacked into + // integer ops. + quantization_config.mutable_pipeline_config()->set_unpack_quantized_types( + true); + PostCalibrationComponent post_calibration_component(context.get()); TF_ASSIGN_OR_RETURN(*module_ref, post_calibration_component.Run( - *module_ref, QuantizationConfig())); + *module_ref, quantization_config)); } else { TF_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqPostCalibrationStepName, /*add_passes_func=*/ + /*name=*/ + kTfQuantPtqPostCalibrationStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqPostCalibrationPasses( pm, quantization_options, kTfQuantPtqPostCalibrationStepName); @@ -405,7 +418,8 @@ absl::StatusOr QuantizePtqDynamicRange( /*run_tf_to_stablehlo=*/false, /*deserialize_xla_call_module=*/false)); TF_RETURN_IF_ERROR(RunPasses( - /*name=*/kTfQuantPtqDynamicRangeStepName, /*add_passes_func=*/ + /*name=*/ + kTfQuantPtqDynamicRangeStepName, /*add_passes_func=*/ [&quantization_options](mlir::PassManager &pm) { AddQuantizePtqDynamicRangePasses(pm, quantization_options, kTfQuantPtqDynamicRangeStepName); @@ -484,13 +498,14 @@ absl::StatusOr QuantizeWeightOnly( /*run_tf_to_stablehlo=*/false, /*deserialize_xla_call_module=*/false)); - TF_RETURN_IF_ERROR( - RunPasses(/*name=*/kTfQuantWeightOnlyStepName, /*add_passes_func=*/ - [&quantization_options](mlir::PassManager &pm) { - AddQuantizeWeightOnlyPasses(pm, quantization_options, - kTfQuantWeightOnlyStepName); - }, - *context, *module_ref)); + TF_RETURN_IF_ERROR(RunPasses( + kTfQuantWeightOnlyStepName, + /*add_passes_func=*/ + [&quantization_options](mlir::PassManager &pm) { + AddQuantizeWeightOnlyPasses(pm, quantization_options, + kTfQuantWeightOnlyStepName); + }, + 
*context, *module_ref)); const bool unfreeze_constants = !quantization_options.freeze_all_variables(); TF_ASSIGN_OR_RETURN(const std::string checkpoint_dir, GetLocalTmpFileName()); diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py index 13db5fff7a8cdc..556222ef4797d5 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py +++ b/tensorflow/compiler/mlir/quantization/tensorflow/python/quantize_model.py @@ -682,15 +682,16 @@ def _populate_quantization_options_default_values( == _PresetMethod.METHOD_STATIC_RANGE_WEIGHT_ONLY_INT8 ) or ( - quantization_options.op_set == quant_opts_pb2.OpSet.XLA + quantization_options.op_set + in (quant_opts_pb2.OpSet.XLA, quant_opts_pb2.OpSet.STABLEHLO) and quantization_options.quantization_method.preset_method == _PresetMethod.METHOD_STATIC_RANGE_INT8 ) ): raise ValueError( 'Currently, per-channel quantization is supported for Uniform Quantized' - ' opset, weight only quantization, or XLA opset with static range' - ' quantization.' + ' opset, weight only quantization, or XLA/StableHLO opset with static' + ' range quantization.' ) if ( diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc index 136aab9f583030..2ca81b72aa71aa 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.cc @@ -30,6 +30,7 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/legalize_tf_xla_call_module_to_stablehlo_pass.h" +#include "tensorflow/compiler/mlir/lite/stablehlo/transforms/passes.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/rename_entrypoint_to_main.h" #include "tensorflow/compiler/mlir/lite/stablehlo/transforms/tf_stablehlo_pass.h" #include "tensorflow/compiler/mlir/quantization/stablehlo/cc/pass_pipeline.h" @@ -56,8 +57,10 @@ void AddUnfuseMhloOpsPasses(mlir::PassManager& pm) { mlir::mhlo::createLegalizeEinsumToDotGeneralPass()); pm.addNestedPass( mlir::mhlo::createLegalizeDotToDotGeneralPass()); - pm.addNestedPass( - mlir::quant::stablehlo::createUnfuseMhloBatchNormPass()); + // Unfuse mhlo BatchNorm to primitive ops. + pm.addNestedPass(mlir::odml::createUnfuseBatchNormPass()); + // Fuse Conv + Mul to Conv. + pm.addNestedPass(mlir::odml::createFuseConvolutionPass()); pm.addNestedPass( mlir::mhlo::createLegalizeTorchIndexSelectToGatherPass()); } diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h index c7e191796031f4..740dca6c7b106b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h +++ b/tensorflow/compiler/mlir/quantization/tensorflow/quantize_preprocess.h @@ -23,6 +23,7 @@ limitations under the License. 
#include "absl/strings/string_view.h" #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/core/public/session.h" namespace tensorflow { @@ -58,6 +59,8 @@ inline absl::Status PreprocessAndFreezeGraph(mlir::ModuleOp module_op, /*deserialize_xla_call_module=*/false); } +void AddTFToStablehloPasses(mlir::PassManager& pm); + } // namespace quantization } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir new file mode 100644 index 00000000000000..f4ef2e0f1d26f8 --- /dev/null +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/add_dump_tensor_op_stablehlo.mlir @@ -0,0 +1,76 @@ +// RUN: tf-quant-opt %s -split-input-file -quant-add-dump-tensor-op='debugger_type=int_per_layer' | FileCheck --check-prefix=IntPerLayer %s + +module { + func.func @matmul2(%arg0: tensor {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor) { + %0 = stablehlo.constant dense<[-0.211145893, -0.708605706]> : tensor<2xf32> + %1 = stablehlo.constant dense<[[-0.630731344, 0.54962182], [0.180364341, -0.764542698]]> : tensor<2x2xf32> + %2 = "tf.XlaCallModule"(%arg0, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor + %3 = "tf.XlaCallModule"(%2, %1, %0) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor, 
tensor<2x2xf32>, tensor<2xf32>) -> tensor + return %3 : tensor + } + func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.constant dense<0.000000e+00> : tensor + %1 = stablehlo.constant dense<6.000000e+00> : tensor + %2 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<2x2xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [1] : (tensor<2xf32>, tensor<2xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + %6 = stablehlo.clamp %0, %5, %1 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } + func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1(%arg0: tensor, %arg1: tensor<2x2xf32>, %arg2: tensor<2xf32>) -> tensor attributes {_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.constant dense<0.000000e+00> : tensor + %1 = stablehlo.constant dense<6.000000e+00> : tensor + %2 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor, tensor<2x2xf32>) -> tensor + %3 = shape.shape_of %2 : tensor -> tensor<2xindex> + %4 = stablehlo.dynamic_broadcast_in_dim %arg2, %3, dims = [1] : (tensor<2xf32>, tensor<2xindex>) -> tensor + %5 = stablehlo.add %2, %4 : tensor + %6 = stablehlo.clamp %0, %5, %1 : (tensor, tensor, tensor) -> tensor + return %6 : tensor + } + +// IntPerLayer-LABEL: func @matmul2 +// IntPerLayer-DAG: %[[b0:.*]] = stablehlo.constant dense<[-0.211145893 +// IntPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344 +// IntPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2, 
_original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", _tfl_quant_trait = "fully_quantizable"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor) -> () +// IntPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_2", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_2", node_name = "_empty_node"}> : (tensor) -> () +// IntPerLayer-DAG: %[[matmul1_q:.*]] = "tf.XlaCallModule"(%[[matmul0_q]], %[[w0]], %[[b0]]) <{Sout = [#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", _tfl_quant_trait = "fully_quantizable"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor) -> () +// IntPerLayer-DAG: %[[matmul1_uq:.*]] = "tf.XlaCallModule"(%[[matmul0_q]], %[[w0]], %[[b0]]) <{Sout = 
[#tf_type.shape], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0, _original_entry_function = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0"} : (tensor, tensor<2x2xf32>, tensor<2xf32>) -> tensor +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul1_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_with_bias_and_relu6_dynamic_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_with_bias_and_relu6_dynamic_fn_1", node_name = "_empty_node"}> : (tensor) -> () +// IntPerLayer-DAG: return %[[matmul1_q]] : tensor +// IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2 +// IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1 +// IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_2_0 +// IntPerLayer-DAG: func.func private @composite_dot_general_with_bias_and_relu6_dynamic_fn_1_0 +} + +// ----- + +module { + func.func @matmul_concat(%arg0: tensor<1x2xf32> {tf_saved_model.index_path = ["input_tensor"]}) -> (tensor<2x3xf32>) { + %0 = stablehlo.constant dense<[[-0.630731344, 0.54962182, 0.180364341], [-0.764542698, -0.211145893, -0.708605706]]> : tensor<2x3xf32> + %1 = stablehlo.constant dense<1.000000e+00> : tensor<1x3xf32> + %2 = "tf.XlaCallModule"(%arg0, %0) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + %3 = stablehlo.concatenate %2, %1, dim = 0 : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x3xf32> + return %3 : tensor<2x3xf32> + } + func.func private @composite_dot_general_fn_1(%arg0: tensor<1x2xf32>, %arg1: tensor<2x3xf32>) -> tensor<1x3xf32> attributes 
{_from_xla_call_module, tf_quant.composite_function} { + %0 = stablehlo.dot_general %arg0, %arg1, contracting_dims = [1] x [0], precision = [DEFAULT, DEFAULT] : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> + return %0 : tensor<1x3xf32> + } + +// IntPerLayer-LABEL: func @matmul_concat +// IntPerLayer-DAG: %[[w0:.*]] = stablehlo.constant dense<{{\[\[}}-0.630731344 +// IntPerLayer-DAG: %[[c0:.*]] = stablehlo.constant dense<1.000000e+00 +// IntPerLayer-DAG: %[[matmul0_q:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1, _original_entry_function = "composite_dot_general_fn_1", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}, _tfl_quant_trait = "fully_quantizable"} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_q]]) <{enabled = false, file_name = "quantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> () +// IntPerLayer-DAG: %[[matmul0_uq:.*]] = "tf.XlaCallModule"(%arg0, %[[w0]]) <{Sout = [#tf_type.shape<1x3>], module = "", version = 9 : i64}> {_entry_function = @composite_dot_general_fn_1_0, _original_entry_function = "composite_dot_general_fn_1_0", _stablehlo_module_attrs = {jax.uses_shape_polymorphism = true}} : (tensor<1x2xf32>, tensor<2x3xf32>) -> tensor<1x3xf32> +// IntPerLayer-DAG: "tf.DumpTensor"(%[[matmul0_uq]]) <{enabled = false, file_name = "unquantized_tensor_data.pb", func_name = "composite_dot_general_fn_1", log_dir_path = "/tmp/dumps/composite_dot_general_fn_1", node_name = "_empty_node"}> : (tensor<1x3xf32>) -> () +// IntPerLayer-DAG: %[[concat:.*]] = stablehlo.concatenate %[[matmul0_q]], %[[c0]], dim = 0 : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<2x3xf32> +// IntPerLayer-DAG: return %[[concat]] : tensor<2x3xf32> +// IntPerLayer-DAG: func.func private 
@composite_dot_general_fn_1 +// IntPerLayer-DAG: func.func private @composite_dot_general_fn_1_0 +} diff --git a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir index 1e771e2586a61e..772b38f56e242b 100644 --- a/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir +++ b/tensorflow/compiler/mlir/quantization/tensorflow/tests/prepare_lifting.mlir @@ -359,3 +359,43 @@ func.func @depthwise_conv2d_with_large_weight_and_add(%arg0: tensor<*xf32>) -> ( // CHECK-NEXT: %[[DEPTHWISE_CONV2D:.*]] = "tf.DepthwiseConv2dNative"(%arg0, %[[CONST]]) // CHECK-NEXT: %[[BIASADD:.*]] = "tf.BiasAdd"(%[[DEPTHWISE_CONV2D]], %[[CONST_0]]) // CHECK-NEXT: return %[[BIASADD]] + +// ----- + +func.func @fuse_conv2d_with_sub_and_mul(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_1 = "tf.Const"() {value = dense<0.200000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Sub"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + func.return %2 : tensor<1x3x2x2xf32> +} + +// CHECK: func @fuse_conv2d_with_sub_and_mul +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<-0.0800000056> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// 
CHECK-NEXT: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) +// CHECK-NEXT: %[[BIAS_ADD:.*]] = "tf.BiasAdd"(%[[CONV]], %[[CONST]]) +// CHECK-NEXT: return %[[BIAS_ADD]] + +// ----- + +func.func @fuse_conv2d_with_sub_mul_addv2(%arg0: tensor<1x3x4x3xf32>) -> (tensor<1x3x2x2xf32>) { + %cst = "tf.Const"() {value = dense<2.000000e+00> : tensor<2x3x3x2xf32>} : () -> tensor<2x3x3x2xf32> + %cst_0 = "tf.Const"() {value = dense<0.400000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_1 = "tf.Const"() {value = dense<0.200000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %cst_2 = "tf.Const"() {value = dense<0.300000e+00> : tensor<1x1x1x2xf32>} : () -> tensor<1x1x1x2xf32> + %0 = "tf.Conv2D"(%arg0, %cst) {data_format = "NHWC", dilations = [1, 1, 2, 1], explicit_paddings = [], padding = "SAME", strides = [1, 1, 2, 1], use_cudnn_on_gpu = true} : (tensor<1x3x4x3xf32>, tensor<2x3x3x2xf32>) -> tensor<1x3x2x2xf32> + %1 = "tf.Sub"(%0, %cst_0) {data_format = "NHWC"} : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %2 = "tf.Mul"(%1, %cst_1) : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + %3 = "tf.AddV2"(%2, %cst_2) : (tensor<1x3x2x2xf32>, tensor<1x1x1x2xf32>) -> tensor<1x3x2x2xf32> + func.return %3 : tensor<1x3x2x2xf32> +} + +// CHECK: func @fuse_conv2d_with_sub_mul_addv2 +// CHECK-DAG: %[[CONST:.*]] = "tf.Const"() <{value = dense<2.200000e-01> : tensor<2xf32>}> : () -> tensor<2xf32> +// CHECK-DAG: %[[CONST_0:.*]] = "tf.Const"() <{value = dense<4.000000e-01> : tensor<2x3x3x2xf32>}> : () -> tensor<2x3x3x2xf32> +// CHECK-NEXT: %[[CONV:.*]] = "tf.Conv2D"(%arg0, %[[CONST_0]]) +// CHECK-NEXT: %[[BIAS_ADD:.*]] = "tf.BiasAdd"(%[[CONV]], %[[CONST]]) +// CHECK-NEXT: return %[[BIAS_ADD]] diff --git a/tensorflow/compiler/mlir/register_common_dialects.cc b/tensorflow/compiler/mlir/register_common_dialects.cc index 4cda39bdbb6745..b089bd9a1eb787 100644 --- a/tensorflow/compiler/mlir/register_common_dialects.cc +++ 
b/tensorflow/compiler/mlir/register_common_dialects.cc @@ -29,7 +29,6 @@ limitations under the License. #include "xla/mlir/framework/ir/xla_framework.h" #include "xla/mlir_hlo/mhlo/IR/register.h" #include "xla/service/cpu/hlo_xla_runtime_pipeline.h" -#include "xla/translate/mhlo_to_lhlo_with_xla/mhlo_to_lhlo_with_xla.h" namespace mlir { diff --git a/tensorflow/compiler/mlir/runlit.cfg.py b/tensorflow/compiler/mlir/runlit.cfg.py index e23afc78e6de3b..3bc7791bf2f477 100644 --- a/tensorflow/compiler/mlir/runlit.cfg.py +++ b/tensorflow/compiler/mlir/runlit.cfg.py @@ -71,14 +71,10 @@ 'flatbuffer_to_string', 'flatbuffer_translate', 'hlo_to_kernel', - 'hlo_to_llvm_ir', - 'ifrt-opt', 'json_to_flatbuffer', 'kernel-gen-opt', 'lhlo-tfrt-opt', - 'mlir-bisect', 'mlir-hlo-opt', - 'mlir-interpreter-runner', 'mlir-opt', 'mlir-tflite-runner', 'mlir-translate', @@ -99,13 +95,6 @@ 'tfg-transforms-opt', 'tfg-translate', 'tfjs-opt', - 'xla-cpu-opt', - 'xla-gpu-opt', - 'xla-mlir-gpu-opt', - 'xla-runtime-opt', - 'xla-translate', - 'xla-translate-gpu-opt', - 'xla-translate-opt', ] tools = [ToolSubst(s, unresolved='ignore') for s in tool_names] llvm_config.add_tool_substitutions(tools, tool_dirs) diff --git a/tensorflow/compiler/mlir/runlit.site.cfg.py b/tensorflow/compiler/mlir/runlit.site.cfg.py index 5541e2fb580b8a..dc75547758e11f 100644 --- a/tensorflow/compiler/mlir/runlit.site.cfg.py +++ b/tensorflow/compiler/mlir/runlit.site.cfg.py @@ -46,20 +46,7 @@ 'tensorflow/compiler/mlir/tensorflow', 'tensorflow/compiler/mlir/tfrt', 'tensorflow/compiler/mlir/tools/kernel_gen', - 'tensorflow/compiler/mlir/xla', - os.path.join(external_srcdir, 'local_xla/xla/mlir/backends/cpu'), - os.path.join(external_srcdir, 'local_xla/xla/mlir/backends/gpu'), - os.path.join(external_srcdir, 'local_xla/xla/mlir/runtime'), - os.path.join(external_srcdir, 'local_xla/xla/mlir/tools/mlir_bisect'), os.path.join(external_srcdir, 'local_xla/xla/mlir_hlo'), - os.path.join(external_srcdir, 
'local_xla/xla/python/ifrt/ir/tests'), - os.path.join(external_srcdir, 'local_xla/xla/service/gpu/tests'), - os.path.join(external_srcdir, 'local_xla/xla/service/mlir_gpu'), - os.path.join(external_srcdir, 'local_xla/xla/translate'), - os.path.join( - external_srcdir, - 'local_xla/xla/translate/mhlo_to_lhlo_with_xla' - ), 'tensorflow/core/ir/importexport/', 'tensorflow/core/ir/tests/', 'tensorflow/core/transforms/', diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc index 5ce3ccf0cc2c8d..5d9c1d32a92a10 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.cc @@ -31,6 +31,7 @@ limitations under the License. #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" @@ -40,6 +41,10 @@ limitations under the License. 
namespace mlir { namespace TF { +namespace { +constexpr char kCompositeDevice[] = "tf._composite_device"; +} // namespace + ResourceConstructingOps::ResourceConstructingOps(Operation *op) { if (op) ops.insert(op); } @@ -57,7 +62,11 @@ ResourceConstructingOps ResourceConstructingOps::getPessimisticValueState( auto global_tensor = tf_saved_model::LookupBoundInputOfType< tf_saved_model::GlobalTensorOp>(func, barg.getArgNumber(), symbol_table); - return ResourceConstructingOps(global_tensor); + ResourceConstructingOps result(global_tensor); + if (func.getArgAttr(barg.getArgNumber(), kCompositeDevice)) { + result.is_on_composite_device = true; + } + return result; } } else if (auto vh = dyn_cast(value.getDefiningOp())) { return ResourceConstructingOps(vh); @@ -74,17 +83,24 @@ ResourceConstructingOps ResourceConstructingOps::join( ResourceConstructingOps ret; ret.ops.insert(lhs.ops.begin(), lhs.ops.end()); ret.ops.insert(rhs.ops.begin(), rhs.ops.end()); + ret.is_on_composite_device = + lhs.is_on_composite_device || rhs.is_on_composite_device; return ret; } void ResourceConstructingOps::print(raw_ostream &os) const { - llvm::interleaveComma(ops, os << "["), os << "]"; + llvm::interleaveComma(ops, os << "["); + if (is_on_composite_device) { + os << " COMPOSITE"; + } + os << "]"; } void ResourceDataflowAnalysis::visitOperation(Operation *op, ArrayRef operands, ArrayRef results) { LLVM_DEBUG(llvm::dbgs() << "ResAn: Visiting operation: " << *op << "\n"); + if (auto cast = dyn_cast(op)) { join(results[0], *operands[0]); } else if (auto while_op = dyn_cast(op)) { @@ -94,6 +110,30 @@ void ResourceDataflowAnalysis::visitOperation(Operation *op, join(getLatticeElement(arg), *getLatticeElement(value)); } } + } else if (auto while_op = dyn_cast(op)) { + func::FuncOp cond = SymbolTable::lookupNearestSymbolFrom( + while_op, while_op.getCondAttr()); + func::FuncOp body = SymbolTable::lookupNearestSymbolFrom( + while_op, while_op.getBodyAttr()); + for (auto &arg : 
while_op->getOpOperands()) { + BlockArgument cond_arg = cond.getArgument(arg.getOperandNumber()); + join(getLatticeElement(cond_arg), *getLatticeElement(arg.get())); + BlockArgument body_arg = body.getArgument(arg.getOperandNumber()); + join(getLatticeElement(body_arg), *getLatticeElement(arg.get())); + } + } else if (auto graph = dyn_cast(op)) { + for (auto &arg : graph.GetFetch()->getOpOperands()) { + if (arg.getOperandNumber() < graph.getNumResults()) { + auto result = graph.getResult(arg.getOperandNumber()); + join(getLatticeElement(result), *getLatticeElement(arg.get())); + } + } + } else if (auto island = dyn_cast(op)) { + for (auto &arg : island.GetYield()->getOpOperands()) { + auto result = island.getResult(arg.getOperandNumber()); + join(getLatticeElement(result), *getLatticeElement(arg.get())); + // getLatticeElement(arg.get())->print(llvm::errs()); + } } else { setAllToEntryStates(results); } diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h index 68f6fa2d44c763..61fdb0c39f0693 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h +++ b/tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h @@ -42,7 +42,8 @@ struct ResourceConstructingOps { static ResourceConstructingOps getPessimisticValueState(MLIRContext *context); static ResourceConstructingOps getPessimisticValueState(Value value); bool operator==(const ResourceConstructingOps &rhs) const { - return ops == rhs.ops; + return ops == rhs.ops && + is_on_composite_device == rhs.is_on_composite_device; } static ResourceConstructingOps join(const ResourceConstructingOps &lhs, @@ -52,6 +53,8 @@ struct ResourceConstructingOps { // The operation(s) which created the resource value. // IR constructs (i.e., GlobalTensorOp) are not const-correct. 
mutable DenseSet ops; + + bool is_on_composite_device = false; }; class ResourceDataflowAnalysis diff --git a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc index b0d730898316d5..c95dd020497385 100644 --- a/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc +++ b/tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.cc @@ -194,7 +194,7 @@ void CategorizeParallelIdsMap( groups_different_branch = 0; groups_from_only = 0; groups_to_only = 0; - for (auto [group, branch] : from) { + for (const auto& [group, branch] : from) { auto to_iter = to.find(group); if (to_iter == to.end()) { ++groups_from_only; @@ -207,7 +207,7 @@ void CategorizeParallelIdsMap( } } } - for (auto [group, _] : to) { + for (const auto& [group, _] : to) { auto from_iter = from.find(group); if (from_iter == from.end()) { ++groups_to_only; @@ -246,13 +246,13 @@ void SideEffectAnalysisInfo::SetLastWrites( void SideEffectAnalysisInfo::Enter() { per_resource_access_info_.clear(); - for (auto [resource, last_writes] : stack_down_.back()) { + for (const auto& [resource, last_writes] : stack_down_.back()) { SetLastWrites(resource, last_writes); } } void SideEffectAnalysisInfo::Exit() { - for (auto [resource, _] : per_resource_access_info_) { + for (const auto& [resource, _] : per_resource_access_info_) { absl::flat_hash_set last_writes = GetLastWrites(resource); auto& resource_to_operations = stack_up_.back(); resource_to_operations.try_emplace(resource); @@ -265,7 +265,7 @@ void SideEffectAnalysisInfo::Exit() { void SideEffectAnalysisInfo::Down() { stack_down_.emplace_back(); stack_up_.emplace_back(); - for (auto [resource, _] : per_resource_access_info_) { + for (const auto& [resource, _] : per_resource_access_info_) { absl::flat_hash_set last_writes = GetLastWrites(resource); stack_down_.back()[resource] = last_writes; } @@ -279,7 +279,7 @@ void SideEffectAnalysisInfo::Lateral() { 
void SideEffectAnalysisInfo::Up() { Exit(); - for (auto [resource, last_writes] : stack_up_.back()) { + for (const auto& [resource, last_writes] : stack_up_.back()) { SetLastWrites(resource, last_writes); } stack_down_.pop_back(); diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc index a0549858ffd111..5d145c85a68a06 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device.cc @@ -28,6 +28,7 @@ limitations under the License. #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/SMLoc.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project @@ -37,10 +38,12 @@ limitations under the License. #include "mlir/IR/OpImplementation.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/PatternMatch.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Types.h" // from @llvm-project #include "mlir/IR/UseDefLists.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Interfaces/CallInterfaces.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Support/TypeID.h" // from @llvm-project @@ -64,9 +67,10 @@ struct TFInlinerInterface : public DialectInlinerInterface { // Analysis Hooks //===--------------------------------------------------------------------===// - // Allow all call operations to be inlined. + // Returns whether it's legal to inline a call to a function. 
bool isLegalToInline(Operation* call, Operation* callable, bool wouldBeCloned) const final { + if (isa(call)) return false; return true; } @@ -77,10 +81,8 @@ struct TFInlinerInterface : public DialectInlinerInterface { return true; } - // Defines the legality of inlining TF Device operations. - bool isLegalToInline(Operation*, Region*, bool, - IRMapping&) const final { - // For now, enable inlining all operations. + // Defines the legality of inlining TF Device operations into a region. + bool isLegalToInline(Operation* call, Region*, bool, IRMapping&) const final { return true; } @@ -130,6 +132,27 @@ TensorFlowDeviceDialect::TensorFlowDeviceDialect(MLIRContext* context) addInterfaces(); } +//===----------------------------------------------------------------------===// +// tf_device.cluster_func +//===----------------------------------------------------------------------===// + +LogicalResult ClusterFuncOp::verifySymbolUses( + mlir::SymbolTableCollection& symbolTable) { + StringAttr func_attr = getFuncAttr().getRootReference(); + func::FuncOp func = + symbolTable.lookupNearestSymbolFrom(*this, func_attr); + if (!func) { + return emitError("'func' attribute refers to an undefined function: ") + << func_attr.getValue(); + } + return success(); +} + +void ClusterFuncOp::setCalleeFromCallable(mlir::CallInterfaceCallable callee) { + SymbolRefAttr calleeAttr = callee.get(); + return setFuncAttr(cast(calleeAttr)); +} + //===----------------------------------------------------------------------===// // tf_device.launch //===----------------------------------------------------------------------===// diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td index 343127301d4057..74a12cfa1e22db 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_device_ops.td @@ -19,8 +19,10 @@ limitations under the License. 
#define TF_DEVICE_DIALECT include "mlir/IR/OpBase.td" +include "mlir/Interfaces/CallInterfaces.td" include "mlir/Interfaces/ControlFlowInterfaces.td" include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/SymbolInterfaces.td" //===----------------------------------------------------------------------===// // TensorFlow Device Dialect definitions @@ -335,7 +337,8 @@ used to form the cluster. let hasCanonicalizer = 1; } -def TfDevice_ClusterFuncOp : TfDevice_Op<"cluster_func", []> { +def TfDevice_ClusterFuncOp : TfDevice_Op<"cluster_func", + [CallOpInterface, DeclareOpInterfaceMethods]> { let summary = [{ The `tf_device.cluster_func` launches a function containing the body of a cluster. @@ -347,7 +350,7 @@ This op is used for outlining a cluster. let arguments = (ins FlatSymbolRefAttr:$func, - Variadic + Variadic:$args ); let results = (outs @@ -355,10 +358,19 @@ This op is used for outlining a cluster. ); let extraClassDeclaration = [{ + // Gets the argument operands to the called function. + operand_range getArgOperands() { return getArgs(); } + MutableOperandRange getArgOperandsMutable() { + return getArgsMutable(); + } // returns the function that this operation will launch. func::FuncOp getFuncOp() { return SymbolTable::lookupNearestSymbolFrom(*this, getFuncAttr()); } + CallInterfaceCallable getCallableForCallee() { + return getFuncAttr(); + } + void setCalleeFromCallable(::mlir::CallInterfaceCallable callee); }]; } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td index 71cf2490e911f4..5e0e58c279e358 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tf_generated_ops.td @@ -12463,6 +12463,18 @@ of the tensor. Rank is also known as "order", "degree", or "ndims." 
let hasFolder = 1; } +def TF_ReadFileOp : TF_Op<"ReadFile", [Pure, TF_NoConstantFold]> { + let summary = "Reads and outputs the entire contents of the input filename."; + + let arguments = (ins + TF_StrTensor:$filename + ); + + let results = (outs + TF_StrTensor:$contents + ); +} + def TF_ReadVariableOp : TF_Op<"ReadVariableOp", []> { let summary = "Reads the value of a variable."; diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc index 879aa62ab28f03..b2ae51a1189686 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.cc @@ -90,7 +90,7 @@ mlir::LogicalResult PwStreamResultsOp::verify() { } //===----------------------------------------------------------------------===// -// IfrtProgramCall +// IfrtCall //===----------------------------------------------------------------------===// mlir::LogicalResult IfrtCallOp::verify() { @@ -115,6 +115,26 @@ mlir::LogicalResult IfrtCallOp::verify() { } } + // Verify variable_arg_indices is sorted in ascending order. 
+ int64_t prev_index = -1; + for (auto arg_index_attr : getVariableArgIndicesAttr()) { + if (!arg_index_attr.isa_and_nonnull()) { + return emitOpError() << "variable_arg_indices must be an integer"; + } + + int64_t index = + arg_index_attr.dyn_cast().getValue().getSExtValue(); + if (index < 0) { + return emitOpError() << "variable_arg_indices must be positive"; + } + + if (index <= prev_index) { + return emitOpError() + << "variable_arg_indices must be sorted in ascending order"; + } + prev_index = index; + } + return mlir::success(); } diff --git a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td index af4bdcea69182e..fe230904c241be 100644 --- a/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.td @@ -63,7 +63,7 @@ def TF__TfrtGetResourceOp : TF_Op<"_TfrtGetResource", let hasVerifier = 1; } -def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", []> { +def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", [Pure]> { let summary = "Loads a variable tensor as an IFRT array"; let description = [{ @@ -77,6 +77,9 @@ def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", []> { `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding configuration specified in `VariableDeviceShardingConfigProto`. + + This op returns a scalar string tensor containing the loaded variable name, which can be + used as a key to look for the loaded IFRT array in runtime. 
}]; let arguments = (ins @@ -85,7 +88,12 @@ def TF_IfrtLoadVariableOp : TF_Op<"IfrtLoadVariable", []> { DefaultValuedAttr:$name ); + let results = (outs + TF_StrTensor:$array_key + ); + TF_DerivedOperandTypeListAttr Tin = TF_DerivedOperandTypeListAttr<0>; + TF_DerivedResultTypeListAttr Tout = TF_DerivedResultTypeListAttr<0>; } @@ -101,14 +109,13 @@ def TF_IfrtCallOp : TF_Op<"IfrtCall", []> { that the outlined function is compiled into an executable and is available for lookup from `IfrtCall` TF ops. - This op also takes `variable_names` attribute to bind the variables (weights) - by names. + `variable_arg_indices` is a sorted (ascending order) array and indicates which + element of `args` is a key to a loaded array corresponding to a variable. }]; let arguments = (ins Variadic : $args, I64Attr : $program_id, - StrArrayAttr : $variable_names, I32ArrayAttr : $variable_arg_indices ); diff --git a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir index be0ab858484acd..05e5105883f94b 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/constant-fold.mlir @@ -752,3 +752,14 @@ func.func @testGlobalIterIdNotFolded() -> (tensor) { // CHECK: return %[[X]] func.return %0: tensor } + +// ----- + +// CHECK-LABEL: func @testReadFileOpNotFolded +func.func @testReadFileOpNotFolded() -> (tensor) { + %0 = "tf.Const"() { value = dense<"filepath"> : tensor } : () -> tensor + // CHECK: %[[X:.*]] = "tf.ReadFile" + %1 = "tf.ReadFile"(%0) : (tensor) -> tensor + // CHECK: return %[[X]] + func.return %1: tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir index d473c1a7d7b67e..5162a2df7366e4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir +++ 
b/tensorflow/compiler/mlir/tensorflow/tests/convert_control_to_data_outputs.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt -tf-executor-convert-control-to-data-outputs -split-input-file %s | FileCheck %s +// RUN: tf-opt %s -pass-pipeline='builtin.module(tf-executor-convert-control-to-data-outputs{composite-tpuexecute-side-effects})' -split-input-file -verify-diagnostics | FileCheck %s !tf_res = tensor>> @@ -574,3 +574,227 @@ func.func @unconnected(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) { } func.return } + +// ----- + +!tf_res = tensor>> +!tf_str = tensor<3x!tf_type.string> + +// CHECK-LABEL: @tpu_execute_while_body +func.func @tpu_execute_while_body(%arg0: !tf_res, %arg1: !tf_res, + %arg2: tensor) + -> (!tf_res, !tf_res, tensor) { + %graph:3 = tf_executor.graph { + %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str + // CHECK: [[exe:%.*]] = tf_executor.island({{.*}}) wraps "tf.TPUExecuteAndUpdateVariables" + %exe_control = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg0, %arg0, %key) { + device_var_reads_indices = [0, 1], + device_var_updates_indices = [0, 1]} : (!tf_res, !tf_res, !tf_str) -> () + + %assign_control_0 = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) : (tensor>>, tensor) -> () + %assign_control_1 = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg2) : (tensor>>, tensor) -> () + %add, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg2) : (tensor, tensor) -> tensor + %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg2) : (tensor, tensor) -> tensor + %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control) wraps "tf.NoOp"() : () -> () + // CHECK-DAG: [[exe]]{{.*}}"tf.Identity"(%arg3) + // CHECK-DAG: "tf.Identity"(%arg4) + // CHECK: tf_executor.fetch + tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control 
+ } + func.return %graph#0, %graph#1, %graph#2 : !tf_res, !tf_res, tensor +} + +// CHECK-LABEL: @tpu_execute_while_cond +func.func @tpu_execute_while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) -> (tensor) { + %graph = tf_executor.graph { + %island, %ctrl = tf_executor.island { + %pred = "tf.SomeOp"(%arg2) : (tensor) -> tensor + tf_executor.yield %pred : tensor + } + tf_executor.fetch %island : tensor + } + func.return %graph : tensor +} + +// CHECK-LABEL: @tpu_execute +func.func @tpu_execute(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) { + tf_executor.graph { + // CHECK: "tf.Const"{{.*}}tensor + %while_out:3, %control_while = tf_executor.island wraps "tf.While"(%arg0, %arg1, %arg2) + {body = @tpu_execute_while_body, + cond = @tpu_execute_while_cond, is_stateless = false} + : (tensor>>, tensor>>, tensor) + -> (tensor>>, tensor>>, tensor) + tf_executor.fetch + } + func.return +} + +// ----- + +!tf_res = tensor>> +!tf_str = tensor<3x!tf_type.string> + +// CHECK-LABEL: @incomplete_composite_devices_while_body +func.func @incomplete_composite_devices_while_body(%arg0: !tf_res, %arg1: !tf_res, + %arg2: tensor) + -> (!tf_res, !tf_res, tensor) { + %graph:3 = tf_executor.graph { + %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str + // CHECK: [[exe:%.*]] = tf_executor.island({{.*}}) wraps "tf.TPUExecuteAndUpdateVariables" + %exe_control = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg0, %arg1, %key) { + device_var_reads_indices = [0, 1], + device_var_updates_indices = [0, 1]} : (!tf_res, !tf_res, !tf_str) -> () + + %assign_control_0 = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) : (tensor>>, tensor) -> () + %assign_control_1 = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg2) : (tensor>>, tensor) -> () + %add, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg2) : (tensor, tensor) -> tensor + %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, 
%arg2) : (tensor, tensor) -> tensor + %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control) wraps "tf.NoOp"() : () -> () + // CHECK: [[exe]]{{.*}}"tf.Identity"(%arg3) + // CHECK-NOT: "tf.Identity" + // CHECK: tf_executor.fetch + tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control + } + func.return %graph#0, %graph#1, %graph#2 : !tf_res, !tf_res, tensor +} + +// CHECK-LABEL: @incomplete_composite_devices_while_cond +func.func @incomplete_composite_devices_while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) -> (tensor) { + %graph = tf_executor.graph { + %island, %ctrl = tf_executor.island { + %pred = "tf.SomeOp"(%arg2) : (tensor) -> tensor + tf_executor.yield %pred : tensor + } + tf_executor.fetch %island : tensor + } + func.return %graph : tensor +} + +// CHECK-LABEL: @incomplete_composite_devices +func.func @incomplete_composite_devices(%arg0: !tf_res {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}, + %arg1: !tf_res, %arg2: tensor) { + tf_executor.graph { + // CHECK: "tf.Const"{{.*}}tensor + %while_out:3, %control_while = tf_executor.island wraps "tf.While"(%arg0, %arg1, %arg2) + {body = @incomplete_composite_devices_while_body, + cond = @incomplete_composite_devices_while_cond, is_stateless = false} + : (tensor>>, tensor>>, tensor) + -> (tensor>>, tensor>>, tensor) + tf_executor.fetch + } + func.return +} + +// ----- + +!tf_res = tensor>> +!tf_str = tensor<3x!tf_type.string> + +// CHECK-LABEL: @complete_composite_devices_while_body +func.func @complete_composite_devices_while_body(%arg0: !tf_res, %arg1: !tf_res, + %arg2: tensor) + -> (!tf_res, !tf_res, tensor) { + %graph:3 = tf_executor.graph { + %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str + // CHECK: [[exe:%.*]] = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables" + 
%exe_control = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg0, %arg1, %key) { + device_var_reads_indices = [0, 1], + device_var_updates_indices = [0, 1]} : (!tf_res, !tf_res, !tf_str) -> () + + %assign_control_0 = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) : (tensor>>, tensor) -> () + %assign_control_1 = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg2) : (tensor>>, tensor) -> () + %add, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg2) : (tensor, tensor) -> tensor + %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg2) : (tensor, tensor) -> tensor + %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control) wraps "tf.NoOp"() : () -> () + // CHECK: "tf.Identity"(%arg3) + // CHECK: "tf.Identity"(%arg4) + // CHECK: tf_executor.fetch + tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control + } + func.return %graph#0, %graph#1, %graph#2 : !tf_res, !tf_res, tensor +} + +// CHECK-LABEL: @complete_composite_devices_while_cond +func.func @complete_composite_devices_while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) -> (tensor) { + %graph = tf_executor.graph { + %island, %ctrl = tf_executor.island { + %pred = "tf.SomeOp"(%arg2) : (tensor) -> tensor + tf_executor.yield %pred : tensor + } + tf_executor.fetch %island : tensor + } + func.return %graph : tensor +} + +// CHECK-LABEL: @complete_composite_devices +func.func @complete_composite_devices( + %arg0: !tf_res {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}, + %arg1: !tf_res {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}, %arg2: tensor) { + tf_executor.graph { + // CHECK: "tf.Const"{{.*}}tensor + %while_out:3, %control_while = tf_executor.island wraps "tf.While"(%arg0, %arg1, %arg2) + {body = @complete_composite_devices_while_body, + cond = 
@complete_composite_devices_while_cond, is_stateless = false} + : (tensor>>, tensor>>, tensor) + -> (tensor>>, tensor>>, tensor) + tf_executor.fetch + } + func.return +} + +// ----- + +!tf_res = tensor>> +!tf_str = tensor<3x!tf_type.string> + +// CHECK-LABEL: @tpu_execute_with_non_resource_operands_while_body +func.func @tpu_execute_with_non_resource_operands_while_body(%arg0: !tf_res, %arg1: !tf_res, + %arg2: tensor) + -> (!tf_res, !tf_res, tensor) { + %graph:3 = tf_executor.graph { + %key, %key_control = tf_executor.island wraps "tf.Const"() {value = dense<"">: !tf_str} : () -> !tf_str + // CHECK: [[exe:%.*]] = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables" + %exe_control = tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%arg2, %arg0, %arg1, %key) { + device_var_reads_indices = [1, 2], + device_var_updates_indices = [1, 2]} : (tensor, !tf_res, !tf_res, !tf_str) -> () + + %assign_control_0 = tf_executor.island wraps "tf.AssignVariableOp"(%arg0, %arg2) : (tensor>>, tensor) -> () + %assign_control_1 = tf_executor.island wraps "tf.AssignVariableOp"(%arg1, %arg2) : (tensor>>, tensor) -> () + %add, %add_control = tf_executor.island wraps "tf.Add"(%arg2, %arg2) : (tensor, tensor) -> tensor + %mul, %mul_control = tf_executor.island wraps "tf.Mul"(%arg2, %arg2) : (tensor, tensor) -> tensor + %control_barrier = tf_executor.island(%assign_control_0, %assign_control_1, %add_control, %exe_control) wraps "tf.NoOp"() : () -> () + // CHECK: "tf.Identity"(%arg3) + // CHECK: "tf.Identity"(%arg4) + // CHECK: tf_executor.fetch + tf_executor.fetch %arg0, %arg1, %add, %control_barrier, %mul_control : tensor>>, tensor>>, tensor, !tf_executor.control, !tf_executor.control + } + func.return %graph#0, %graph#1, %graph#2 : !tf_res, !tf_res, tensor +} + +// CHECK-LABEL: @tpu_execute_with_non_resource_operands_while_cond +func.func @tpu_execute_with_non_resource_operands_while_cond(%arg0: !tf_res, %arg1: !tf_res, %arg2: tensor) -> (tensor) { + %graph = 
tf_executor.graph { + %island, %ctrl = tf_executor.island { + %pred = "tf.SomeOp"(%arg2) : (tensor) -> tensor + tf_executor.yield %pred : tensor + } + tf_executor.fetch %island : tensor + } + func.return %graph : tensor +} + +// CHECK-LABEL: @tpu_execute_with_non_resource_operands +func.func @tpu_execute_with_non_resource_operands(%arg0: !tf_res {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}, + %arg1: !tf_res {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}, %arg2: tensor) { + tf_executor.graph { + // CHECK: "tf.Const"{{.*}}tensor + %while_out:3, %control_while = tf_executor.island wraps "tf.While"(%arg0, %arg1, %arg2) + {body = @tpu_execute_with_non_resource_operands_while_body, + cond = @tpu_execute_with_non_resource_operands_while_cond, is_stateless = false} + : (tensor>>, tensor>>, tensor) + -> (tensor>>, tensor>>, tensor) + tf_executor.fetch + } + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir index 9c0eeaaf33e782..9c3202f2dc4945 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/decompose_resource_ops.mlir @@ -495,9 +495,9 @@ func.func @decompose_resource_gather_op(%indices : tensor) -> tensor<*xi3 %resource = "tf.VarHandleOp"() {container = "c", shared_name = "v"} : () -> tensor<*x!tf_type.resource>> // CHECK-DAG: [[READVAR:%.+]] = "tf.ReadVariableOp"([[VAR]]) - // CHECK: [[GATHER:%.+]] = "tf.GatherV2"([[READVAR]], [[INDEX]], [[ZERO]]) <{batch_dims = 0 : i64}> {_xla_outside_compilation = "0"} : (tensor<*xi32>, tensor, tensor) -> tensor<*xi32> + // CHECK: [[GATHER:%.+]] = "tf.GatherV2"([[READVAR]], [[INDEX]], [[ZERO]]) <{batch_dims = 0 : i64}> : (tensor<*xi32>, tensor, tensor) -> tensor<*xi32> // CHECK: return [[GATHER]] - %1 = "tf.ResourceGather"(%resource, %indices) 
{_xla_outside_compilation = "0"} : (tensor<*x!tf_type.resource>>, tensor) -> (tensor<*xi32>) + %1 = "tf.ResourceGather"(%resource, %indices) : (tensor<*x!tf_type.resource>>, tensor) -> (tensor<*xi32>) tf_device.return %1 : tensor<*xi32> }) : () -> (tensor<*xi32>) func.return %0: tensor<*xi32> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/end-to-end-tpu-reshard-variables.mlir b/tensorflow/compiler/mlir/tensorflow/tests/end-to-end-tpu-reshard-variables.mlir index 90ffba031e12de..f4a9e62c3b80c1 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/end-to-end-tpu-reshard-variables.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/end-to-end-tpu-reshard-variables.mlir @@ -12,55 +12,51 @@ // CHECK: TPUExecuteAndUpdateVariables // CHECK: tf_device.replicate // CHECK: TPUReshardVariables -"builtin.module"() ({ - "func.func"() ({ - ^bb0(%res : tensor<*x!tf_type.resource>): - "tf_executor.graph"() ({ - %ctrl = "tf_executor.island"() ({ - "tf.StatefulPartitionedCall"(%res) {config = "", config_proto = "", executor_type = "", f = @partitioned} : (tensor<*x!tf_type.resource>) -> () - "tf_executor.yield"() : () -> () - }) : () -> (!tf_executor.control) - "tf_executor.fetch"(%ctrl) : (!tf_executor.control) -> () - }) : () -> () - "func.return"() : () -> () - }) {arg_attrs = [{tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}], function_type = (tensor<*x!tf_type.resource>) -> (), sym_name = "main"} : () -> () - "func.func"() ({ - ^bb0(%res : tensor<*x!tf_type.resource>): - "tf_executor.graph"() ({ - %ctrl = "tf_executor.island"() ({ - %w = "tf.While"(%res) {body = @while_body, cond = @while_cond, is_stateless = false, shape_invariant} : (tensor<*x!tf_type.resource>) -> (tensor>>) - "tf_executor.yield"() : () -> () - }) : () -> (!tf_executor.control) - "tf_executor.fetch"(%ctrl) : (!tf_executor.control) -> () - }) : () -> () - "func.return"() : () -> () - }) 
{arg_attrs = [{tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}], function_type = (tensor<*x!tf_type.resource>) -> (), sym_name = "partitioned", sym_visibility = "private"} : () -> () - "func.func"() ({ - ^bb0(%res : tensor>>): - %g = "tf_executor.graph"() ({ - %i:2 = "tf_executor.island"() ({ - "tf.TPUReplicateMetadata"() {_tpu_replicate = "cluster", allow_soft_placement = true, computation_shape = [], device = "", device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 2 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = false, use_tpu = true} : () -> () - %one = "tf.Const"() {_tpu_replicate = "cluster", value = dense<1> : tensor} : () -> tensor<*xi32> - %res_rep = "tf.TPUReplicatedInput"(%res) {index = -1 : i64, is_mirrored_variable = true, is_packed = true} : (tensor>>) -> tensor>> - %read = "tf.ReadVariableOp"(%res_rep) {_tpu_replicate = "cluster", device = ""} : (tensor>>) -> tensor<*xi32> - %inc = "tf.Add"(%read, %one) {_tpu_replicate = "cluster"} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> - "tf.AssignVariableOp"(%res_rep, %inc) {_tpu_replicate = "cluster"} : (tensor>>, tensor<*xi32>) -> () - %res_out:2 = "tf.TPUReplicatedOutput"(%res_rep) : (tensor>>) -> (tensor>>, tensor>>) - "tf_executor.yield"(%res) : (tensor>>) -> () - }) : () -> (tensor>>, !tf_executor.control) - "tf_executor.fetch"(%i#0) : (tensor>>) -> () - }) : () -> tensor>> - "func.return"(%g) : (tensor>>) -> () - }) {arg_attrs = [{tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}], function_type = (tensor>>) -> tensor>>, sym_name = "while_body", sym_visibility = "private"} : () -> () - "func.func"() ({ - ^bb0(%res : tensor>>): - %g = 
"tf_executor.graph"() ({ - %i:2 = "tf_executor.island"() ({ - %c = "tf.Const"() {value = dense<0> : tensor} : () -> tensor<*xi1> - "tf_executor.yield"(%c) : (tensor<*xi1>) -> () - }) : () -> (tensor<*xi1>, !tf_executor.control) - "tf_executor.fetch"(%i#0) : (tensor<*xi1>) -> () - }) : () -> tensor<*xi1> - "func.return"(%g) : (tensor<*xi1>) -> () - }) {arg_attrs = [{tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}], function_type = (tensor>>) -> (tensor<*xi1>), sym_name = "while_cond", sym_visibility = "private"} : () -> () -}) {tf.devices = {"/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:CPU:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU:1" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1163 : i32}} : () -> () +module attributes {tf.devices = {"/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:CPU:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU:0" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU:1" = {}, "/job:tpu_host_worker/replica:0/task:0/device:TPU_SYSTEM:0" = {}}, tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1163 : i32}} { + func.func @main(%arg0: tensor<*x!tf_type.resource> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) { + tf_executor.graph { + %control = tf_executor.island { + "tf.StatefulPartitionedCall"(%arg0) <{config = "", config_proto = "", executor_type = "", f = @partitioned}> : (tensor<*x!tf_type.resource>) -> () + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return + } + 
func.func private @partitioned(%arg0: tensor<*x!tf_type.resource> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) { + tf_executor.graph { + %control = tf_executor.island { + %0 = "tf.While"(%arg0) <{body = @while_body, cond = @while_cond, is_stateless = false, shape_invariant}> : (tensor<*x!tf_type.resource>) -> tensor>> + tf_executor.yield + } + tf_executor.fetch %control : !tf_executor.control + } + return + } + func.func private @while_body(%arg0: tensor>> {tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> tensor>> { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island { + "tf.TPUReplicateMetadata"() <{allow_soft_placement = true, computation_shape = [], device_assignment = [], host_compute_core = [], num_cores_per_replica = 1 : i64, num_replicas = 2 : i64, padding_map = [], step_marker_location = "STEP_MARK_AT_ENTRY", topology = "", tpu_compile_options_proto = "", use_spmd_for_xla_partitioning = false, use_tpu = true}> {_tpu_replicate = "cluster", device = ""} : () -> () + %cst = "tf.Const"() <{value = dense<1> : tensor}> {_tpu_replicate = "cluster"} : () -> tensor<*xi32> + %1 = "tf.TPUReplicatedInput"(%arg0) <{index = -1 : i64, is_mirrored_variable = true, is_packed = true}> : (tensor>>) -> tensor>> + %2 = "tf.ReadVariableOp"(%1) {_tpu_replicate = "cluster", device = ""} : (tensor>>) -> tensor<*xi32> + %3 = "tf.Add"(%2, %cst) {_tpu_replicate = "cluster"} : (tensor<*xi32>, tensor<*xi32>) -> tensor<*xi32> + "tf.AssignVariableOp"(%1, %3) {_tpu_replicate = "cluster"} : (tensor>>, tensor<*xi32>) -> () + %4:2 = "tf.TPUReplicatedOutput"(%1) : (tensor>>) -> (tensor>>, tensor>>) + tf_executor.yield %arg0 : tensor>> + } + tf_executor.fetch %outputs : tensor>> + } + return %0 : tensor>> + } + func.func private @while_cond(%arg0: tensor>> 
{tf._composite_device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf.device = "/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0"}) -> tensor<*xi1> { + %0 = tf_executor.graph { + %outputs, %control = tf_executor.island { + %cst = "tf.Const"() <{value = dense : tensor}> : () -> tensor<*xi1> + tf_executor.yield %cst : tensor<*xi1> + } + tf_executor.fetch %outputs : tensor<*xi1> + } + return %0 : tensor<*xi1> + } +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/hoist_broadcast_read.mlir b/tensorflow/compiler/mlir/tensorflow/tests/hoist_broadcast_read.mlir new file mode 100644 index 00000000000000..c88310443ed0d1 --- /dev/null +++ b/tensorflow/compiler/mlir/tensorflow/tests/hoist_broadcast_read.mlir @@ -0,0 +1,68 @@ +// RUN: tf-opt %s -split-input-file -tf-hoist-broadcast-read | FileCheck %s + +// The read should be hoisted. + +// CHECK-LABEL: func @hoist_cpu +func.func @hoist_cpu(%arg0: tensor<*x!tf_type.resource>> {tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}) -> () { + // CHECK: %[[READ:.*]] = "tf.ReadVariableOp" + // CHECK-NEXT: tf_device.replicate + // CHECK-NEXT: "tf.OpA"(%[[READ]]) + tf_device.replicate {n = 2 : i32} { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + "tf.OpA"(%0) : (tensor) -> () + } + func.return +} + +// ----- + +// The read should not be hoisted because the resource does not have device type CPU. + +// CHECK-LABEL: func @only_hoist_cpu +func.func @only_hoist_cpu(%arg0: tensor<*x!tf_type.resource>>) -> () { + // CHECK: tf_device.replicate + // CHECK-NEXT: "tf.ReadVariableOp" + tf_device.replicate {n = 2 : i32} { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + "tf.OpA"(%0) : (tensor) -> () + } + func.return +} + +// ----- + +// The read should not be hoisted because it follows a write. 
+ +// CHECK-LABEL: func @skip_read_after_write +func.func @skip_read_after_write(%arg0: tensor<*x!tf_type.resource>> {tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}) -> () { + // CHECK: tf_device.replicate + // CHECK: "tf.AssignVariableOp" + // CHECK-NEXT: "tf.ReadVariableOp" + tf_device.replicate {n = 2 : i32} { + %0 = "tf.OpA"() : () -> tensor + "tf.AssignVariableOp"(%arg0, %0) : (tensor<*x!tf_type.resource>>, tensor) -> () + %1 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + "tf.OpB"(%1) : (tensor) -> () + } + func.return +} + +// ----- + +// Check that hoisting preserves read order. + +// CHECK-LABEL: func @order_preserved +func.func @order_preserved(%arg0: tensor<*x!tf_type.resource>> {tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}, %arg1: tensor<*x!tf_type.resource>>, %arg2: tensor<*x!tf_type.resource>> {tf.device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0"}) -> () { + // CHECK: %[[READ0:.*]] = "tf.ReadVariableOp"(%arg0) + // CHECK-NEXT: %[[READ2:.*]] = "tf.ReadVariableOp"(%arg2) + // CHECK-NEXT: tf_device.replicate + // CHECK-NEXT: %[[READ1:.*]] = "tf.ReadVariableOp"(%arg1) + // CHECK-NEXT: "tf.OpA"(%[[READ0]], %[[READ1]], %[[READ2]]) + tf_device.replicate {n = 2 : i32} { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + %1 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource>>) -> tensor + %2 = "tf.ReadVariableOp"(%arg2) : (tensor<*x!tf_type.resource>>) -> tensor + "tf.OpA"(%0, %1, %2) : (tensor, tensor, tensor) -> () + } + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir index 021cad3b78be8f..389a682d3afe46 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/prepare_tpu_computation_for_tf_export.mlir @@ 
-167,9 +167,25 @@ func.func @UnsupportedOp(%arg0: tensor) -> tensor { // _XlaHostComputeMlir with manual_sharding should not fall back to // XlaHostCompute, because XlaHostCompute does not support manual_sharding. +// Instead, it is skipped and the MlirXlaOpKernel is expected to handle it. func.func @HostComputeManualNoFallback(%arg0: tensor) -> () { - // expected-error @+1 {{manual_sharding not supported with fallback}} + // CHECK: "tf._XlaHostComputeMlir" %1 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "host_compute_channel_recv1", send_key = "host_compute_channel_send1", host_mlir_module = "", manual_sharding = true} : (tensor) -> (tensor) func.return } + +// ----- + +// CHECK-LABEL: test_xla_call_module_with_host_communicative_subcomputation +func.func @test_xla_call_module_with_host_communicative_subcomputation() { + "tf.XlaCallModule"() {Sout = [], device = "", dim_args_spec = [], function_list = [@callee], module = "", platforms = [], version = 4 : i64} : () -> () + func.return +} + +// CHECK-LABEL: callee +func.func private @callee(%arg0: tensor) { + "tf.XlaHostCompute"(%arg0) <{ancestors = [], key = "@host_func", recv_key = "", send_key = "", shapes = []}> {_xla_original_oc_node_name = "hcb0", _xla_token_input_nodes = ["_xla_token_arg_node"]} : (tensor) -> () + return + } + diff --git a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir index eff3e38ab5ace2..479df14b6546f4 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/region-control-flow-to-functional.mlir @@ -960,3 +960,59 @@ func.func @testGeneratorDatasetRegionWithComplexBlocks(%arg0: tensor<4xf32>, %ar }) {device = "/job:tpu_host_worker/replica:0/task:0/device:CPU:0", metadata = "", operandSegmentSizes = array, output_shapes = [#tf_type.shape<>], output_types = [!tf_type.string]} : 
(tensor<4xf32>, tensor<3xf32>, tensor, tensor<2xf32>) -> tensor return } + +// ----- + +func.func private @tf.WhileRegion_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + %1 = builtin.unrealized_conversion_cast to tensor + %2 = builtin.unrealized_conversion_cast to tensor + return %1, %2 : tensor, tensor +} +func.func private @tf.WhileRegion_cond(%arg0: tensor) -> tensor { + %0 = builtin.unrealized_conversion_cast to tensor + return %0 : tensor +} +// CHECK-LABEL: testNameCollision +func.func @testNameCollision(%arg0: tensor) { + %1 = builtin.unrealized_conversion_cast to tensor + %2 = builtin.unrealized_conversion_cast to tensor + // CHECK: "tf.While" + // CHECK-SAME: body = @tf.WhileRegion_body_1 + // CHECK-SAME: cond = @tf.WhileRegion_cond_0 + %3:2 = "tf.WhileRegion"(%1, %2) <{is_stateless = false}> ({ + ^bb0(%arg1: tensor, %arg2: tensor): + %8 = func.call @tf.WhileRegion_cond(%arg1) : (tensor) -> tensor + "tf.Yield"(%8, %arg1, %arg2) : (tensor, tensor, tensor) -> () + }, { + ^bb0(%arg1: tensor, %arg2: tensor): + %8:2 = func.call @tf.WhileRegion_body(%arg1, %arg2) : (tensor, tensor) -> (tensor, tensor) + "tf.Yield"(%8#0, %8#1) : (tensor, tensor) -> () + }) : (tensor, tensor) -> (tensor, tensor) + return +} + +// ----- + +func.func private @my_cond(%arg0: tensor, %arg1: tensor) -> tensor { + %0 = builtin.unrealized_conversion_cast to tensor + return %0 : tensor +} +func.func private @my_body(%arg0: tensor, %arg1: tensor) -> (tensor, tensor) { + return %arg0, %arg1 : tensor, tensor +} +// CHECK-LABEL: testConditionWithPassthroughArgs +func.func @testConditionWithPassthroughArgs(%arg1: tensor, %arg2: tensor) { + // CHECK: "tf.While" + // CHECK-SAME: body = @my_body + // CHECK-SAME: cond = @my_cond + %3:2 = "tf.WhileRegion"(%arg1, %arg2) <{is_stateless = false}> ({ + ^bb0(%barg1: tensor, %barg2: tensor): + %8 = func.call @my_cond(%barg1, %barg2) : (tensor, tensor) -> tensor + "tf.Yield"(%8, %barg1, %barg2) : (tensor, tensor, tensor) -> () + }, { + 
^bb0(%barg1: tensor, %barg2: tensor): + %r1, %r2 = func.call @my_body(%barg1, %barg2) : (tensor, tensor) -> (tensor, tensor) + "tf.Yield"(%r1, %r2) : (tensor, tensor) -> () + }) : (tensor, tensor) -> (tensor, tensor) + return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/remove_unused_arguments.mlir b/tensorflow/compiler/mlir/tensorflow/tests/remove_unused_arguments.mlir index e4c1941a1c202f..c3608a2fb13145 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/remove_unused_arguments.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/remove_unused_arguments.mlir @@ -175,11 +175,13 @@ func.func @can_remove_all_results(%arg0: f32) -> (f32, f32) { // CHECK-LABEL: @has_inner_function func.func private @has_inner_function(%arg0: f32) -> (f32, f32) { - func.func private @inner() -> (tensor, tensor) { - %0, %1 = "some_constant"() : () -> (tensor, tensor) - // CHECK: return - // CHECK-SAME: tensor, tensor - return %0, %1 : tensor, tensor + builtin.module { + func.func private @inner() -> (tensor, tensor) { + %0, %1 = "some_constant"() : () -> (tensor, tensor) + // CHECK: return + // CHECK-SAME: tensor, tensor + return %0, %1 : tensor, tensor + } } // CHECK: return // CHECK-NOT: arg diff --git a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir index 034592fe138580..f7bac4ba31b50a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/shape_inference.mlir @@ -1436,6 +1436,15 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr func.return %0 : tensor<*xi32> } + // CHECK-LABEL: func @xla_gather_with_fold + // CHECK-SAME: (%arg0: tensor<1x1x9xi32>, %arg1: tensor<1xi32>) -> tensor<1x1x8xi32> + func.func @xla_gather_with_fold(%arg0: tensor<1x1x9xi32>, %arg1: tensor<1xi32>) -> tensor<*xi32> { + %cst = "tf.Const"() {value = dense<[1, 1, 8]> : tensor<3xi32>} : () -> tensor<3xi32> + 
%slice_size = "tf.Identity"(%cst) : (tensor<3xi32>) -> tensor<3xi32> + %0 = "tf.XlaGather"(%arg0, %arg1, %slice_size) {dimension_numbers = "\0A\03\00\01\02\1A\01\02", indices_are_sorted = true} : (tensor<1x1x9xi32>, tensor<1xi32>, tensor<3xi32>) -> tensor<*xi32> + func.return %0 : tensor<*xi32> + } + // CHECK: func private @sum_reducer3(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NEXT: %0 = "tf.AddV2"(%arg0, %arg1) {device = ""} : (tensor, tensor) -> tensor // CHECK-NEXT: return %0 : tensor @@ -2256,4 +2265,4 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, pr return %3#1, %3#2, %4, %5 : tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32> } -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir index fc6ef646e340dd..4632eea672bb78 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_device_ops.mlir @@ -1,4 +1,4 @@ -// RUN: tf-opt %s | tf-opt | FileCheck %s +// RUN: tf-opt --verify-diagnostics --split-input-file %s | FileCheck %s // CHECK-LABEL: func @return_no_operands func.func @return_no_operands() { @@ -9,6 +9,8 @@ func.func @return_no_operands() { func.return } +// ----- + // CHECK-LABEL: func @return_one_operand // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<*xf32>) func.func @return_one_operand(%arg_0: tensor<*xf32>) { @@ -19,6 +21,8 @@ func.func @return_one_operand(%arg_0: tensor<*xf32>) { func.return } +// ----- + // CHECK-LABEL: func @return_multiple_operands // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<*xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<*xi32>) func.func @return_multiple_operands(%arg_0: tensor<*xf32>, %arg_1: tensor<*xi32>) { @@ -29,6 +33,8 @@ func.func @return_multiple_operands(%arg_0: tensor<*xf32>, %arg_1: tensor<*xi32> func.return } +// ----- + // CHECK-LABEL: func @empty_replicate func.func 
@empty_replicate() { tf_device.replicate {n = 2 : i32} { @@ -40,6 +46,8 @@ func.func @empty_replicate() { // CHECK-NEXT: tf_device.return } +// ----- + // CHECK-LABEL: func @no_operand_replicate func.func @no_operand_replicate() { tf_device.replicate {n = 2 : i32} { @@ -53,6 +61,8 @@ func.func @no_operand_replicate() { // CHECK: tf_device.return } +// ----- + // CHECK-LABEL: func @replicate_with_multiple_operands func.func @replicate_with_multiple_operands() { %0 = "tf.opA"() : () -> tensor<*xi1> @@ -95,6 +105,8 @@ func.func @replicate_with_multiple_operands() { // CHECK-NEXT: tf_device.return } +// ----- + // CHECK-LABEL: func @replicate_derived_operandSegmentSizes func.func @replicate_derived_operandSegmentSizes() { tf_device.replicate {n = 2 : i32, operandSegmentSizes = array} { @@ -107,6 +119,8 @@ func.func @replicate_derived_operandSegmentSizes() { // CHECK-NEXT: tf_device.return } +// ----- + // CHECK-LABEL: func @replicate_with_return // CHECK-SAME: (%[[ARG_0:[a-z0-9]*]]: tensor<*xf32>, %[[ARG_1:[a-z0-9]*]]: tensor<*xf32>, %[[ARG_2:[a-z0-9]*]]: tensor<*xi32>) func.func @replicate_with_return(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %arg2: tensor<*xi32>) { @@ -121,6 +135,8 @@ func.func @replicate_with_return(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>, %ar // CHECK-NEXT: tf_device.return %[[INPUT_0]], %[[ARG_2]] } +// ----- + // CHECK-LABEL: func @replicate_with_devices func.func @replicate_with_devices() { tf_device.replicate() {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/DEVICE:0", "/DEVICE:1"]}} { @@ -134,6 +150,8 @@ func.func @replicate_with_devices() { // CHECK-NEXT: tf_device.return } +// ----- + // CHECK-LABEL: func @replicate_with_multiple_devices func.func @replicate_with_multiple_devices() { tf_device.replicate() {n = 2 : i32, devices = {TPU_REPLICATED_CORE_0 = ["/DEVICE:0", "/DEVICE:1"], TPU_REPLICATED_CORE_1 = ["/DEVICE:2", "/DEVICE:3"]}} { @@ -147,6 +165,8 @@ func.func @replicate_with_multiple_devices() { // CHECK-NEXT: tf_device.return 
} +// ----- + // CHECK-LABEL: func @replicate_with_inner_ops func.func @replicate_with_inner_ops() { %0 = "tf.opA"() : () -> (tensor<*xi1>) @@ -162,6 +182,8 @@ func.func @replicate_with_inner_ops() { func.return } +// ----- + // CHECK-LABEL: func @parallel_execute_two_regions func.func @parallel_execute_two_regions() { "tf_device.parallel_execute"() ({ @@ -173,6 +195,8 @@ func.func @parallel_execute_two_regions() { func.return } +// ----- + // CHECK-LABEL: func @parallel_execute_two_regions_with_ops func.func @parallel_execute_two_regions_with_ops() { "tf_device.parallel_execute"() ({ @@ -187,6 +211,8 @@ func.func @parallel_execute_two_regions_with_ops() { func.return } +// ----- + // CHECK-LABEL: func @parallel_execute_regions_with_data_results func.func @parallel_execute_regions_with_data_results() { "tf_device.parallel_execute"() ({ @@ -200,3 +226,11 @@ func.func @parallel_execute_regions_with_data_results() { }) {} : () -> (tensor<*xi1>, tensor<*xi32>, tensor<*xf32>) func.return } + +// ----- + +func.func @parallel_execute_regions_with_data_results(%arg0: tensor) -> tensor { + // expected-error @+1 {{'func' attribute refers to an undefined function: undefined_func}} + %0 = "tf_device.cluster_func"(%arg0) {func = @undefined_func} : (tensor) -> tensor + func.return %0 : tensor +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir index 9c0f9b2eddb29b..3e7f029d8ff864 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tf_saved_model_freeze_global_tensors.mlir @@ -161,3 +161,70 @@ module attributes {tf_saved_model.semantics} { // tf_saved_model.semantics. 
// CHECK-LABEL: module module {} + +// ----- + +!tf_res = tensor>> +!tf_str = tensor<3x!tf_type.string> + +// CHECK-LABEL: module attributes +module attributes {tf_saved_model.semantics} { + +"tf_saved_model.global_tensor"() {sym_name = "v1", type = tensor, value = dense<3.0> : tensor } : () -> () +"tf_saved_model.global_tensor"() {sym_name = "v2", type = tensor, value = dense<2.0> : tensor } : () -> () + +// CHECK-LABEL: @body +func.func private @body(%arg0: !tf_res, %arg1: !tf_res) -> (!tf_res, !tf_res) { + %graph:2 = tf_executor.graph { + %value, %value_control = tf_executor.island wraps "tf.GetKey"() : () -> tensor + %ret0, %ret0_control = tf_executor.island wraps "tf.SomeOp"() : () -> !tf_res + %ret1, %ret1_control = tf_executor.island wraps "tf.SomeOp"() : () -> !tf_res + %control_unknown = tf_executor.island wraps "tf.UnknownOp"() : () -> () + %key, %key_control = tf_executor.island wraps "tf.GetKey"() : () -> !tf_str + // CHECK: "tf.ReadVariableOp"(%arg0) + %read1, %read1_control = tf_executor.island wraps "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor + // CHECK: "tf.ReadVariableOp"(%arg1) + %read2, %read2_control = tf_executor.island wraps "tf.ReadVariableOp"(%arg1) : (!tf_res) -> tensor + tf_executor.island wraps "tf.TPUExecuteAndUpdateVariables"(%ret0, %ret1, %key) { + device_var_reads_indices = [0, 1], + device_var_updates_indices = [0, 1]} : (!tf_res, !tf_res, !tf_str) -> () + tf_executor.fetch %ret0, %ret1: !tf_res, !tf_res + } + func.return %graph#0, %graph#1 : !tf_res, !tf_res +} + +// CHECK-LABEL: @cond +func.func private @cond(%arg0: !tf_res, %arg1: !tf_res) -> (tensor) { + %graph = tf_executor.graph { + %island, %ctrl = tf_executor.island { + %pred = "tf.SomeOp"() : () -> tensor + tf_executor.yield %pred : tensor + } + tf_executor.fetch %island : tensor + } + func.return %graph : tensor +} + +// CHECK-LABEL: @test_while_loop +func.func @test_while_loop(%arg0: !tf_res {tf._composite_device = 
"/job:tpu_host_worker/replica:0/task:0/device:COMPOSITE:0", tf_saved_model.bound_input = @v1}, + %arg1: !tf_res {tf_saved_model.bound_input = @v2}) + attributes {tf_saved_model.exported_names = ["test_while_loop"]} { + // CHECK-DAG: Const{{.*}}2.0 + // CHECK-DAG: Const{{.*}}3.0 + %read1 = "tf.ReadVariableOp"(%arg0) : (!tf_res) -> tensor + %read2 = "tf.ReadVariableOp"(%arg1) : (!tf_res) -> tensor + // CHECK: tf_executor.graph + tf_executor.graph { + %handle0, %handle0_control = tf_executor.island wraps "tf.SomeOp"() : () -> !tf_res + %handle1, %handle1_control = tf_executor.island wraps "tf.SomeOp"() : () -> !tf_res + %control_A = tf_executor.island wraps "tf.OpA"() : () -> () + %while_out:2, %while_control = tf_executor.island(%control_A) wraps "tf.While"( + %handle0, %handle1) { + body = @body, cond = @cond, is_stateless = false + } : (tensor>>, tensor>>) -> (tensor>>, tensor>>) + %control_B = tf_executor.island(%while_control) wraps "tf.OpB"() : () -> () + tf_executor.fetch + } + func.return +} +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tfrt_ops.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tfrt_ops.mlir index 3fb11e56172276..3bd1677bc30baa 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tfrt_ops.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tfrt_ops.mlir @@ -13,3 +13,15 @@ func.func @testPwStreamResults(%arg0: tensor, %arg1: tensor) { } // ----- +// CHECK-LABEL: func @test_ifrt_call +func.func @test_ifrt_call(%arg0: tensor, %arg1: tensor) { + %result = "tf.IfrtCall"(%arg0, %arg1) <{program_id = 1234 : i64, variable_arg_indices = [0 : i32, 1 : i32], variable_names = ["a", "b"]}> : (tensor, tensor) -> (tensor<1x1xf32>) + func.return +} + +// ----- +func.func @test_ifrt_call_fail_unsorted_variable_arg_indices(%arg0: tensor, %arg1: tensor) { + // expected-error@below {{variable_arg_indices must be sorted in ascending order}} + %result = "tf.IfrtCall"(%arg0, %arg1) <{program_id = 1234 : i64, variable_arg_indices = [1 : i32, 0 : i32], 
variable_names = ["a", "b"]}> : (tensor, tensor) -> (tensor<1x1xf32>) + func.return +} diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir index 91e4ff2b714cd2..4d400d437ab28a 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_resource_partitioning.mlir @@ -148,6 +148,8 @@ func.func @resource_missing_subtype(%arg0: tensor, %arg1: ten // ----- +func.func private @computation(%arg0: tensor) -> tensor + func.func @missing_num_cores_per_replica(%arg0: tensor>>) { // expected-error@+1 {{op num cores per replica unavailable}} %0 = "tf.TPUPartitionedInputV2"(%arg0) {_XlaSharding = "", partition_dims = [], is_packed = true} : (tensor>>) -> tensor>> @@ -159,6 +161,8 @@ func.func @missing_num_cores_per_replica(%arg0: tensor) -> tensor + func.func @mismatch_num_cores_per_replica(%arg0: tensor>>) { // expected-error@+1 {{expects 2 operands but found 3}} %0 = "tf.TPUPartitionedInputV2"(%arg0, %arg0, %arg0) {_XlaSharding = "", partition_dims = []} : (tensor>>, tensor>>, tensor>>) -> tensor>> @@ -243,6 +247,8 @@ func.func @non_replicated_sharding(%arg0: tensor>> // ----- +func.func private @computation(%arg0: tensor) -> tensor + func.func @packed_replicated(%arg0: tensor>> {tf.device = "COMPOSITE"}) { // expected-error@+1 {{support}} %0 = "tf.TPUPartitionedInputV2"(%arg0, %arg0) {_XlaSharding = "", partition_dims = [], is_packed = false} : (tensor>>, tensor>>) -> tensor>> diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir index 9796913d50cc15..55b68e5de2fb5f 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_rewrite.mlir @@ -1334,6 +1334,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor }) : () -> () 
func.return } + func.func @empty_func() { + func.return + } } // ----- @@ -2447,6 +2450,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor "tf_device.cluster_func"() {_replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } + func.func @empty_func() { + func.return + } } // ----- @@ -2457,6 +2463,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor "tf_device.cluster_func"() {_xla_compile_device_type = "TPU", _replication_info = "", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } + func.func @empty_func() { + func.return + } } // ----- @@ -2467,6 +2476,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor "tf_device.cluster_func"() {_xla_compile_device_type = "XPU", _replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } + func.func @empty_func() { + func.return + } } // ----- @@ -2477,6 +2489,9 @@ module attributes {tf.versions = {producer = 888 : i32}, tf.devices = ["/job:wor "tf_device.cluster_func"() {_xla_compile_device_type = "XPU", _replication_info = "cluster0", func = @empty_func, num_cores_per_replica = 1, step_marker_location = "", topology = "", device_assignment = [], input_sharding_configuration = [], output_sharding_configuration = [], use_spmd_for_xla_partitioning = false} : () -> () func.return } + func.func 
@empty_func() { + func.return + } } // ----- diff --git a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir index 468e3495439799..66cae16a67424d 100644 --- a/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir +++ b/tensorflow/compiler/mlir/tensorflow/tests/tpu_sharding_identification.mlir @@ -174,6 +174,72 @@ func.func @func_with_sharding_after_read_variable(%arg0: tensor<*x!tf_type.resou // ----- +// Tests sharding propagation in while region body. +// CHECK-LABEL: func @check_sharding_for_read_variable_inside_while_body +func.func @check_sharding_for_read_variable_inside_while_body(%arg0 : tensor, %arg1: tensor<*x!tf_type.resource>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource>>) -> tensor<128x1024xf32> + %1:1 = "tf_device.cluster_func"(%arg0, %0) {func = @func_with_sharding_inside_while_body, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", num_cores_per_replica = 2 : i64, use_spmd_for_xla_partitioning = true, use_tpu = true} : (tensor, tensor<128x1024xf32>) -> (tensor<128x1024xf32>) + // CHECK: input_sharding_configuration + // CHECK-SAME: ["", "\0A\0B\0C"] + // CHECK: output_sharding_configuration + // CHECK-SAME: ["\0D\0E\0F"] + func.return +} + +// CHECK-LABEL: func @func_with_sharding_inside_while_body +// CHECK-SAME: (%{{[a-z0-9]+}}: tensor {mhlo.sharding = ""}, %{{[a-z0-9]+}}: tensor<128x1024xf32> {mhlo.sharding = "\0A\0B\0C"}) +// CHECK-SAME: -> (tensor<128x1024xf32> {mhlo.sharding = "\0D\0E\0F"}) +func.func @func_with_sharding_inside_while_body(%arg0: tensor, %arg1: tensor<128x1024xf32>) -> (tensor<128x1024xf32>) { + %cst = "tf.Const"() <{value = dense<0> : tensor}> {device = ""} : () -> tensor + %0:2 = "tf.WhileRegion"(%cst, %arg1) <{is_stateless = false, parallel_iterations = 1 : i64}> ({ + ^bb0(%arg2: tensor, %arg3: tensor<128x1024xf32>): + %1 = "tf.Less"(%arg2, %arg0) : (tensor, tensor) -> 
tensor + "tf.Yield"(%1) : (tensor) -> () + }, { + ^bb0(%arg2: tensor, %arg3: tensor<128x1024xf32>): + %1 = "tf.XlaSharding"(%arg3) <{_XlaSharding = "\0A\0B\0C", sharding = "\0A\0B\0C"}> {unspecified_dims = []} : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + %2 = "tf.Square"(%1) : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + "tf.Yield"(%arg2, %2) : (tensor, tensor<128x1024xf32>) -> () + }) {_num_original_outputs = 1 : i64, _read_only_resource_inputs = [1], _xla_propagate_compile_time_consts = true} : (tensor, tensor<128x1024xf32>) -> (tensor, tensor<128x1024xf32>) + %1 = "tf.XlaSharding"(%0#1) { _XlaSharding = "\0D\0E\0F", sharding = "\0D\0E\0F" } : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + func.return %1 : tensor<128x1024xf32> +} + +// ----- + +// Tests sharding propagation in while region condition. +// CHECK-LABEL: func @check_sharding_for_read_variable_inside_while_cond +func.func @check_sharding_for_read_variable_inside_while_cond(%arg0 : tensor, %arg1: tensor<*x!tf_type.resource>>) { + %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource>>) -> tensor<128x1024xf32> + %1:1 = "tf_device.cluster_func"(%arg0, %0) {func = @func_with_sharding_inside_while_cond, step_marker_location = "STEP_MARK_AT_TOP_LEVEL_WHILE_LOOP", num_cores_per_replica = 2 : i64, use_spmd_for_xla_partitioning = true, use_tpu = true} : (tensor, tensor<128x1024xf32>) -> (tensor<128x1024xf32>) + // CHECK: input_sharding_configuration + // CHECK-SAME: ["", "\0A\0B\0C"] + // CHECK: output_sharding_configuration + // CHECK-SAME: ["\0D\0E\0F"] + func.return +} + +// CHECK-LABEL: func @func_with_sharding_inside_while_cond +// CHECK-SAME: (%{{[a-z0-9]+}}: tensor {mhlo.sharding = ""}, %{{[a-z0-9]+}}: tensor<128x1024xf32> {mhlo.sharding = "\0A\0B\0C"}) +// CHECK-SAME: -> (tensor<128x1024xf32> {mhlo.sharding = "\0D\0E\0F"}) +func.func @func_with_sharding_inside_while_cond(%arg0: tensor, %arg1: tensor<128x1024xf32>) -> (tensor<128x1024xf32>) { + %cst = "tf.Const"() <{value = dense<0> 
: tensor}> {device = ""} : () -> tensor + %0:2 = "tf.WhileRegion"(%cst, %arg1) <{is_stateless = false, parallel_iterations = 1 : i64}> ({ + ^bb0(%arg2: tensor, %arg3: tensor<128x1024xf32>): + %1 = "tf.XlaSharding"(%arg3) <{_XlaSharding = "\0A\0B\0C", sharding = "\0A\0B\0C"}> {unspecified_dims = []} : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + %2 = "tf.Less"(%arg2, %arg0) : (tensor, tensor) -> tensor + "tf.Yield"(%2) : (tensor) -> () + }, { + ^bb0(%arg2: tensor, %arg3: tensor<128x1024xf32>): + %1 = "tf.Square"(%arg3) : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + "tf.Yield"(%arg2, %1) : (tensor, tensor<128x1024xf32>) -> () + }) {_num_original_outputs = 1 : i64, _read_only_resource_inputs = [1], _xla_propagate_compile_time_consts = true} : (tensor, tensor<128x1024xf32>) -> (tensor, tensor<128x1024xf32>) + %1 = "tf.XlaSharding"(%0#1) { _XlaSharding = "\0D\0E\0F", sharding = "\0D\0E\0F" } : (tensor<128x1024xf32>) -> tensor<128x1024xf32> + func.return %1 : tensor<128x1024xf32> +} + +// ----- + // Tests with input sharding following an identity op and cast op. 
// CHECK-LABEL: func @check_sharding_after_cast_op func.func @check_sharding_after_cast_op(%arg0: tensor<*xi32>, %arg1: tensor<*xi1>) { @@ -313,33 +379,6 @@ func.func @cluster_func(%arg0: tensor<*xf32>) { // ----- -// Tests that device variable sharding defaults to xla.OpSharding -// { type : MAXIMAL -// tile_assignment_dimensions: [ 1 ] -// tile_assignment_devices : [ 0 ] -// } - -// CHECK-LABEL: func @maximal_device_variable -func.func @maximal_device_variable(%arg0: tensor<*x!tf_type.resource>>) { - tf_device.replicate(%arg0 as %arg1: tensor<*x!tf_type.resource>>) - {_mirrored_variable_indices = [0], n = 2 : i32} { - %0 = "tf.ReadVariableOp"(%arg1) : (tensor<*x!tf_type.resource>>) -> tensor<*xf32> - // CHECK: tf_device.cluster_func - // CHECK-SAME: input_sharding_configuration = ["\08\01\1A\01\01\22\01\00"] - "tf_device.cluster_func"(%0) {func = @cluster_func, use_spmd_for_xla_partitioning = true, num_cores_per_replica = 1 : i64} : (tensor<*xf32>) -> () - tf_device.return - } - func.return -} - -// CHECK-LABEL: func @cluster_func -// CHECK-SAME: ({{.+}}: tensor<*xf32> {mhlo.sharding = "\08\01\1A\01\01\22\01\00"}) -func.func @cluster_func(%arg0: tensor<*xf32>) { - func.return -} - -// ----- - // Tests that device variable sharding for an implicitly capture device variable // defaults to REPLICATE. 
@@ -564,6 +603,33 @@ func.func @func(%arg0: tensor<*xi32> {tf.aliasing_output = 1 : i64}, // ----- +// CHECK-LABEL: func @check_symmetric_alias_propagation +func.func @check_symmetric_alias_propagation(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) { + // CHECK: tf_device.cluster_func + // CHECK-SAME: input_sharding_configuration = ["\01\02\03", "\04\05\06"] + // CHECK-SAME: output_sharding_configuration = ["\01\02\03", "\04\05\06"] + "tf_device.cluster_func"(%arg0, %arg1) { + func = @func, + use_spmd_for_xla_partitioning = false, num_cores_per_replica = 1 : i64 + } : (tensor<*xi32>, tensor<*xi32>) -> (tensor<*xi32>, tensor<*xi32>) + func.return +} + +// CHECK-LABEL: func @func +// CHECK-SAME: %arg0: tensor<*xi32> {mhlo.sharding = "\01\02\03" +// CHECK-SAME: %arg1: tensor<*xi32> {mhlo.sharding = "\04\05\06" +// CHECK-SAME: ->{{.*}}mhlo.sharding = "\01\02\03"{{.*}}mhlo.sharding = "\04\05\06" +func.func @func(%arg0: tensor<*xi32> {tf.aliasing_output = 0 : i64}, + %arg1: tensor<*xi32> {tf.aliasing_output = 1 : i64}) -> (tensor<*xi32>, tensor<*xi32>) { + %0 = "tf.XlaSharding"(%arg0) { _XlaSharding = "\01\02\03"} : (tensor<*xi32>) -> tensor<*xi32> + %1 = "tf.A"(%0) : (tensor<*xi32>) -> (tensor<*xi32>) + %2 = "tf.B"(%arg1) : (tensor<*xi32>) -> (tensor<*xi32>) + %3 = "tf.XlaSharding"(%2) { _XlaSharding = "\04\05\06"} : (tensor<*xi32>) -> tensor<*xi32> + func.return %2, %3 : tensor<*xi32>, tensor<*xi32> +} + +// ----- + // Partial tiled inputs using XlaSharding ops identified as REPLICATED should keep the sharding configuration. 
// The following xla.OpSharding is used: // Proto debug string: diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD index bd660e8e8fcce5..66b81b0af9bb9a 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/BUILD +++ b/tensorflow/compiler/mlir/tensorflow/transforms/BUILD @@ -595,6 +595,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", "//tensorflow/compiler/mlir/tensorflow:tensorflow_side_effects", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_traits", "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:tf_ops_layout_helper", "//tensorflow/compiler/mlir/tensorflow:tf_saved_model_inc_gen", @@ -627,11 +628,13 @@ cc_library( "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "//tensorflow/core/protobuf/tpu:dynamic_padding_proto_cc", "//tensorflow/core/tpu:tpu_embedding_optimization_parameters_utils", + "@com_google_absl//absl/algorithm:container", "@com_google_absl//absl/container:btree", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/container:flat_hash_set", "@com_google_absl//absl/log", "@com_google_absl//absl/memory", + "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/strings:str_format", "@com_google_absl//absl/types:variant", @@ -781,6 +784,7 @@ cc_library( "//tensorflow/core:protos_all_cc", "//tensorflow/core/ir/types:Dialect", "@com_google_absl//absl/container:flat_hash_set", + "@com_google_absl//absl/log", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:FuncExtensions", diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc index e16082fad89c4c..71ac89140d7a32 100644 --- 
a/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/convert_control_to_data_outputs.cc @@ -19,15 +19,25 @@ limitations under the License. #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/EquivalenceClasses.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Casting.h" #include "llvm/Support/raw_ostream.h" +#include "mlir/Analysis/DataFlow/ConstantPropagationAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlow/SparseAnalysis.h" // from @llvm-project +#include "mlir/Analysis/DataFlowFramework.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Attributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/OperationSupport.h" // from @llvm-project #include "mlir/IR/SymbolTable.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/analysis/resource_alias_analysis.h" +#include "tensorflow/compiler/mlir/tensorflow/analysis/resource_dataflow.h" #include "tensorflow/compiler/mlir/tensorflow/analysis/side_effect_analysis.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -48,12 +58,19 @@ using OperationSetTy = SmallPtrSet; using ResourceToOpsMapTy = DenseMap; #define GEN_PASS_DEF_EXECUTORCONVERTCONTROLTODATAOUTPUTSPASS +#define GEN_PASS_DECL_EXECUTORCONVERTCONTROLTODATAOUTPUTSPASS #include "tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.h.inc" -class ConvertControlToDataOutputsPass +struct 
ConvertControlToDataOutputsPass : public impl::ExecutorConvertControlToDataOutputsPassBase< ConvertControlToDataOutputsPass> { - public: + ConvertControlToDataOutputsPass() = default; + explicit ConvertControlToDataOutputsPass( + bool composite_tpuexecute_side_effects) + : ExecutorConvertControlToDataOutputsPassBase( + ExecutorConvertControlToDataOutputsPassOptions{ + composite_tpuexecute_side_effects}) {} + void runOnOperation() override; }; @@ -74,6 +91,41 @@ SmallVector GetWhileCallers(func::FuncOp func, return while_callers; } +bool IsResourceType(Type type) { + if (auto tensor_type = type.dyn_cast()) { + return tensor_type.getElementType().isa(); + } + return false; +} + +bool OnlyOperatesOnCompositeDevices(TF::TPUExecuteAndUpdateVariablesOp& op, + const DataFlowSolver& solver) { + llvm::SmallSet read_array; + for (const Attribute& attr : op.getDeviceVarReadsIndices()) { + read_array.insert(attr.cast().getInt()); + } + llvm::SmallSet update_array; + for (const Attribute& attr : op.getDeviceVarUpdatesIndices()) { + update_array.insert(attr.cast().getInt()); + } + for (auto& arg : op->getOpOperands()) { + if (!IsResourceType(arg.get().getType())) { + continue; + } + auto lattice = + solver.lookupState(arg.get()) + ->getValue(); + bool is_read = read_array.contains(arg.getOperandNumber()); + bool is_update = update_array.contains(arg.getOperandNumber()); + // We want the resource operands that are on composite devices to be the + // exact same set as the resource operands that are read or updated. + if ((is_read || is_update) != lattice.is_on_composite_device) { + return false; + } + } + return true; +} + // Populates `chain_resource_to_ops_map`, the map from all resources that need // to be chained to the set of operations that access the resource, and // `resource_equivalence_classes`. 
Resources are equivalent if they are accessed @@ -81,7 +133,8 @@ SmallVector GetWhileCallers(func::FuncOp func, void CollectChainResources( func::FuncOp func, ResourceToOpsMapTy& chain_resource_to_ops_map, llvm::EquivalenceClasses& resource_equivalence_classes, - const TF::SideEffectAnalysis::Info& side_effect_analysis) { + const TF::SideEffectAnalysis::Info& side_effect_analysis, + const DataFlowSolver& solver, bool composite_tpuexecute_side_effects) { auto graph_op = cast(func.front().front()); // For each op in the graph, get the resources it uses and update the access @@ -93,6 +146,17 @@ void CollectChainResources( assert(island.WrapsSingleOp()); Operation& op = island.GetBody().front(); + // If the op only operates on resources stored on devices that are + // "COMPOSITE", then this op is defined to work in parallel with other + // TPUExecute* ops. So we don't need to track resources for it. + // TODO(b/325290168): Do this check per resource, not per op. + if (auto execute = llvm::dyn_cast(op)) { + if (composite_tpuexecute_side_effects && + OnlyOperatesOnCompositeDevices(execute, solver)) { + return WalkResult::advance(); + } + } + ResourceId prev_resource_id = kInvalidResourceId; for (auto resource_id_read_only_pair : side_effect_analysis.GetResourceIds(&op)) { @@ -113,6 +177,7 @@ void CollectChainResources( prev_resource_id = resource_id; } } + return WalkResult::advance(); }); } @@ -386,7 +451,8 @@ TF::WhileOp RewriteWhileOp(TF::WhileOp while_op, int num_resource_inputs, void ConvertControlToDataOutputs( func::FuncOp while_body, SmallVectorImpl& while_callers, OperationSetTy& recompute_analysis_for_funcs, - const TF::SideEffectAnalysis::Info& side_effect_analysis) { + const TF::SideEffectAnalysis::Info& side_effect_analysis, + const DataFlowSolver& solver, bool composite_tpuexecute_side_effects) { if (while_callers.empty()) return; // Collect access information for each resource in the while body that needs @@ -395,7 +461,8 @@ void 
ConvertControlToDataOutputs( ResourceToOpsMapTy chain_resource_to_ops_map; llvm::EquivalenceClasses resource_equivalence_classes; CollectChainResources(while_body, chain_resource_to_ops_map, - resource_equivalence_classes, side_effect_analysis); + resource_equivalence_classes, side_effect_analysis, + solver, composite_tpuexecute_side_effects); // Check for presence of unknown side-effecting ops within the while loop // body. These ops act as barriers and the optimization would not yield much @@ -459,6 +526,13 @@ void ConvertControlToDataOutputs( void ConvertControlToDataOutputsPass::runOnOperation() { ModuleOp module = getOperation(); + + DataFlowSolver solver; + solver.load(); + solver.load(); + solver.load(); + if (failed(solver.initializeAndRun(module))) return signalPassFailure(); + // This pass assumes that all functions are suitable for export i.e., each // function has a single tf_executor.graph op and all islands wrap the // internal op perfectly. Verify that in the beginning once. 
@@ -500,7 +574,8 @@ void ConvertControlToDataOutputsPass::runOnOperation() { } ConvertControlToDataOutputs( while_body, while_callers, recompute_analysis_for_funcs, - side_effect_analysis.GetAnalysisForFunc(while_body)); + side_effect_analysis.GetAnalysisForFunc(while_body), solver, + composite_tpuexecute_side_effects_); } } @@ -511,5 +586,12 @@ CreateTFExecutorConvertControlToDataOutputsPass() { return std::make_unique(); } +std::unique_ptr> +CreateTFExecutorConvertControlToDataOutputsPass( + bool composite_tpuexecute_side_effects) { + return std::make_unique( + composite_tpuexecute_side_effects); +} + } // namespace tf_executor } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td index de4fabe59969c1..8a9e8451a91ba5 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/decompose_resource_ops.td @@ -55,9 +55,6 @@ def Clamp: NativeCodeCall< " $0.getLoc()," " $2.getType(), $2, $1, $3)">; -def CopyAttrs: NativeCodeCallVoid< - "CopyDeviceAndUnderscoredAttributesAdaptor($0, $1)">; - def DecomposeAssignAddVariableOp : Pat< (TF_AssignAddVariableOp:$src_op $resource, $value), @@ -341,12 +338,12 @@ def DecomposeResourceApplyAdamNesterov : def DecomposeResourceGather : Pat< (TF_ResourceGatherOp:$old_result $resource, $indices, $batch_dims, $validate_indices), - (TF_GatherV2Op:$dest + (TF_GatherV2Op (CreateTFReadVariableOp $old_result, $old_result, $resource), $indices, (TF_ConstOp $batch_dims), // axis $batch_dims - ), [], [(CopyAttrs $old_result, $dest)]>; + )>; // Pattern to decompose tf.ResourceScatterAdd into tf.ReadVariable, // tf.TensorScatterAdd, and tf.AssignVariable. 
diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc index 125cbbd6163c33..0cff8946687dcb 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/functional_control_flow_to_regions.cc @@ -48,6 +48,11 @@ namespace { struct FunctionalControlFlowToRegions : public impl::FunctionalControlFlowToRegionsPassBase< FunctionalControlFlowToRegions> { + FunctionalControlFlowToRegions() = default; + explicit FunctionalControlFlowToRegions(bool allow_passthrough_args) + : FunctionalControlFlowToRegionsPassBase( + FunctionalControlFlowToRegionsPassOptions{allow_passthrough_args}) { + } void runOnOperation() override; }; @@ -251,6 +256,11 @@ std::unique_ptr> CreateTFFunctionalControlFlowToRegions() { return std::make_unique(); } +std::unique_ptr> CreateTFFunctionalControlFlowToRegions( + bool allow_passthrough_args) { + return std::make_unique( + allow_passthrough_args); +} } // namespace TF } // namespace mlir diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc index a7fa9268e152d0..1c0a125598cdbe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/fused_kernel_matcher.cc @@ -330,6 +330,16 @@ class FuseMatMulBiasAdd }); return false; } + // FusedMatMul kernel does not support grad_a/grad_b attrs + if ((matmul->hasAttr("grad_a") && + matmul->getAttr("grad_a").cast().getValue()) || + (matmul->hasAttr("grad_b") && + matmul->getAttr("grad_b").cast().getValue())) { + (void)rewriter.notifyMatchFailure(matmul, [&](Diagnostic &diag) { + diag << "FusedMatMul kernel does not support grad_a/grad_b attrs"; + }); + return false; + } return true; } diff --git 
a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h index a52962e5f7f326..9c475f1f9f5281 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/passes.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/passes.h @@ -80,6 +80,8 @@ CreateTFFunctionalControlFlowToCFG(); // their region based counterparts. std::unique_ptr> CreateTFFunctionalControlFlowToRegions(); +std::unique_ptr> CreateTFFunctionalControlFlowToRegions( + bool allow_passthrough_args); // Transforms region bases control flow operations in the TensorFlow dialect to // their functional counterparts. @@ -339,6 +341,9 @@ namespace tf_executor { // Creates a pass to chain control outputs of while loop body. std::unique_ptr> CreateTFExecutorConvertControlToDataOutputsPass(); +std::unique_ptr> +CreateTFExecutorConvertControlToDataOutputsPass( + bool composite_tpuexecute_side_effects); std::unique_ptr> CreateTFExecutorCheckControlDependenciesPass(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc index 56fcdd761999b1..661dafe2a2f327 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/prepare_tpu_computation_for_tf_export.cc @@ -47,8 +47,8 @@ bool IsCommunicationOp(Operation* op) { // subcomputation in the TF/XLA bridge. 
bool SupportsCommunicationComputation(Operation* op) { return isa(op); + TF::XlaCallModuleOp, TF::StatefulPartitionedCallOp, + TF::PartitionedCallOp, TF::LegacyCallOp>(op); } #define GEN_PASS_DEF_PREPARETPUCOMPUTATIONFORTFEXPORTPASS @@ -65,14 +65,17 @@ class RewriteXlaHostComputeMlir public: using OpRewritePattern::OpRewritePattern; - LogicalResult matchAndRewrite(TF::_XlaHostComputeMlirOp op, - PatternRewriter& rewriter) const override { + LogicalResult match(TF::_XlaHostComputeMlirOp op) const override { if (op.getManualSharding()) { - op.emitOpError() << "manual_sharding not supported with fallback of " - "phase 2 legalize TF/XLA bridge. manual_sharding is " - "used by map_outside_compilation"; + // This rewrite does not support manual_sharding. It is expected that the + // _XlaHostComputeMlirOp registered as an MlirXlaOpKernel will handle this + // case later once the XlaBuilder graph reaches it. return failure(); } + return success(); + } + void rewrite(TF::_XlaHostComputeMlirOp op, + PatternRewriter& rewriter) const override { llvm::SmallVector shape_attrs; shape_attrs.reserve(op.getNumResults()); for (Type ty : op.getResultTypes()) { @@ -132,7 +135,6 @@ class RewriteXlaHostComputeMlir op.getRecvKeyAttr(), /*cost_estimate_ns=*/rewriter.getI64IntegerAttr(kDefaultCostEstimate), /*tpu_core=*/rewriter.getI64IntegerAttr(0)); - return success(); } }; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc index 78f2c3e0423124..a669276e35a175 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/region_control_flow_to_functional.cc @@ -183,7 +183,7 @@ StringRef ExtractSingleBlockRegion( } ModuleOp module = region.getParentOfType(); - auto builder = OpBuilder::atBlockBegin(module.getBody()); + OpBuilder builder(module.getContext()); auto loc = 
region.getParentOp()->getLoc(); Block& entry = region.front(); int num_region_arguments = entry.getNumArguments(); @@ -236,8 +236,9 @@ StringRef ExtractSingleBlockRegion( outlined_func.setPrivate(); - // Uniquify the function name. - symbol_table.getSymbolTable(module).insert(outlined_func); + // Uniquify the function name, and insert into module. + symbol_table.getSymbolTable(module).insert(outlined_func, + module.getBody()->begin()); // Add the outlined function to the worklist in case its body has // IfRegion or WhileRegion ops that need to converted. @@ -246,9 +247,10 @@ StringRef ExtractSingleBlockRegion( } // Returns call for region with single call whose result feeds into the -// terminator of the region. if `allow_to_bool` is true, also allows a single -// ToBoolOp between the region yield and the call. Returns none if the region -// does not conform to this pattern. +// terminator of the region. If `allow_to_bool` is true, it allows patterns used +// in the condition of While ops, i.e. it allows a single bool (possibly passed +// through a ToBoolOp) between the region yield and the call. Returns none if +// the region does not conform to this pattern. std::optional IsSingleCallRegion(Region& region, bool allow_to_bool = false) { if (!llvm::hasSingleElement(region)) return std::nullopt; @@ -275,10 +277,23 @@ std::optional IsSingleCallRegion(Region& region, func::CallOp call = dyn_cast(*it++); if (!call) return std::nullopt; - // All call results should feed into expected consumer - // All results of the call should feed into the yield. - if (call.getNumResults() != call_consumer->getNumOperands()) - return std::nullopt; + if (allow_to_bool && call.getNumResults() == 1 && + yield->getNumOperands() != 1) { + // Allow patterns of the form + // %cond = call(...) + // yield %cond, [...passthrough args...] 
+ if (yield->getNumOperands() != block.getNumArguments() + 1) + return std::nullopt; + for (auto [yield_operand, block_arg] : + llvm::zip(yield->getOperands().drop_front(1), block.getArguments())) { + if (yield_operand != block_arg) return std::nullopt; + } + } else { + // All call results should feed into expected consumer + // All results of the call should feed into the yield. + if (call.getNumResults() != call_consumer->getNumOperands()) + return std::nullopt; + } for (auto res_it : llvm::zip(call.getResults(), call_consumer->getOperands())) if (std::get<0>(res_it) != std::get<1>(res_it)) return std::nullopt; diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc index c18debb978fe0f..dc1cfe3f5920fe 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.cc @@ -28,7 +28,9 @@ limitations under the License. #include #include "absl/container/flat_hash_set.h" +#include "absl/log/log.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/Hashing.h" #include "llvm/ADT/PointerUnion.h" #include "llvm/ADT/STLExtras.h" @@ -67,6 +69,7 @@ limitations under the License. #include "mlir/Support/DebugStringHelper.h" // from @llvm-project #include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project #include "mlir/Transforms/FoldUtils.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" @@ -434,6 +437,24 @@ Type GetType(Attribute shape_attr, Attribute type_attr) { else return UnrankedTensorType::get(type.getValue()); } +} // namespace + +// Returns whether type can be further refined. 
+bool CanBeRefined(Type type) { + auto shape_type = type.dyn_cast(); + if (!shape_type) return false; + + // Returns whether type with subtypes can be further refined. + auto can_refine_subtypes = [](TF::TensorFlowTypeWithSubtype tws) { + return tws.GetSubtypes().empty() || + llvm::any_of(tws.GetSubtypes(), CanBeRefined); + }; + auto type_with_subtype = + shape_type.getElementType().dyn_cast(); + if (type_with_subtype && can_refine_subtypes(type_with_subtype)) return true; + + return !shape_type.hasStaticShape(); +} // Returns a new arg type based on the shape and element type. If there are // dynamic bounds attribute to the arg, update the bounds based on the shape @@ -464,25 +485,6 @@ Type GetNewArgType(Type old_arg_type, ArrayRef shape, return new_arg_type; } -} // namespace - -// Returns whether type can be further refined. -bool CanBeRefined(Type type) { - auto shape_type = type.dyn_cast(); - if (!shape_type) return false; - - // Returns whether type with subtypes can be further refined. 
- auto can_refine_subtypes = [](TF::TensorFlowTypeWithSubtype tws) { - return tws.GetSubtypes().empty() || - llvm::any_of(tws.GetSubtypes(), CanBeRefined); - }; - auto type_with_subtype = - shape_type.getElementType().dyn_cast(); - if (type_with_subtype && can_refine_subtypes(type_with_subtype)) return true; - - return !shape_type.hasStaticShape(); -} - // Combination of value producer and port of value produced (e.g., // :, // so for tf.Const -> tensor<10x20xf32>, [0,2,18] would point to a unique output @@ -728,7 +730,8 @@ Attribute ComputeOutputComponent(const ValuePort& value_port, class ShapeInference { public: ShapeInference(int64_t graph_version, ModuleOp module, - bool propagate_caller_callee_constants); + bool propagate_caller_callee_constants, + ArrayRef ops_to_skip); LogicalResult ComputeInputsRequiredForOutput(ValuePort value_port, ValuePortInputs* inputs) { @@ -901,8 +904,8 @@ class ShapeInference { bool RefineResultType(Operation* op, Value result, Type potential_refined_type); - // Infers the shape from a (Stateful)PartionedCall operation by looking up the - // called function and propagating the return type. + // Infers the shape from a (Stateful)PartitionedCall operation by looking up + // the called function and propagating the return type. bool InferShapeForCall(CallOpInterface call_op); bool InferShapeForCast(Operation* op); @@ -1006,6 +1009,9 @@ class ShapeInference { int64_t graph_version_; + // Op types for which shape inference should be skipped. + llvm::SmallDenseSet ops_to_skip_; + // TODO(b/154065712): Remove propagate_caller_callee_constants once using // SCCP pass instead. 
bool propagate_caller_callee_constants_; @@ -1020,11 +1026,15 @@ class ShapeInference { }; ShapeInference::ShapeInference(int64_t graph_version, ModuleOp module, - bool propagate_caller_callee_constants) + bool propagate_caller_callee_constants, + ArrayRef ops_to_skip) : tf_dialect_(module->getContext()->getLoadedDialect()), symbol_users_(symbol_table_, module), graph_version_(graph_version), propagate_caller_callee_constants_(propagate_caller_callee_constants) { + for (const auto& op_type : ops_to_skip) { + ops_to_skip_.insert(op_type); + } // Create symbol table for module. symbol_table_.getSymbolTable(module); } @@ -1079,8 +1089,8 @@ bool ShapeInference::RefineResultType(Operation* op, Value result, result); } -// Infers the shape from a (Stateful)PartionedCall operation by looking up the -// called function and propagating the return type. +// Infers the shape from a (Stateful)PartitionedCall operation by looking up +// the called function and propagating the return type. bool ShapeInference::InferShapeForCall(CallOpInterface call_op) { func::FuncOp func = dyn_cast_or_null(call_op.resolveCallable(&symbol_table_)); @@ -1256,8 +1266,6 @@ bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) { tsl::Status status = loader->RefineDynamicShapes(input_shapes); if (!status.ok()) { - llvm::errs() << "Failed during XlaCallModule shape refinement: " - << status.ToString(); // Do not return false here. // // RefineDynamicShapes returns ok only when it produces full static shapes. @@ -1266,6 +1274,7 @@ bool ShapeInference::InferShapeForXlaCallModule(XlaCallModuleOp op) { // to abort here. // TODO(b/316639984): improve RefineDynamicShapes return values to include // these info. 
+ VLOG(1) << "Failed during XlaCallModule shape refinement: " << status; } mlir::ResultRange op_results = op.getResults(); // The main_outputs may include tokens that are not among the op_results; @@ -1972,8 +1981,17 @@ bool ShapeInference::InferShapeForXlaGatherOp(XlaGatherOp op) { return false; DenseIntElementsAttr slice_sizes_attr; - if (!matchPattern(op.getSliceSizes(), m_Constant(&slice_sizes_attr))) + if (DenseIntElementsAttr attr; + matchPattern(op.getSliceSizes(), m_Constant(&attr))) { + slice_sizes_attr = attr; + } else if (const auto it = results_.find(ValuePort(op.getSliceSizes())); + it != results_.end() && + llvm::isa_and_nonnull(it->second)) { + slice_sizes_attr = llvm::cast(it->second); + } else { return false; + } + llvm::SmallVector slice_sizes; for (const auto& attr : slice_sizes_attr.getValues()) { slice_sizes.push_back(attr.getSExtValue()); @@ -3135,6 +3153,14 @@ FailureOr ShapeInference::InferShapeUntilFixPoint( LLVM_DEBUG(llvm::dbgs() << "Shape inference, iteration " << iteration << "\n"); auto res = region->walk([&](Operation* op) { + auto abstract_op = op->getRegisteredInfo(); + if (abstract_op && ops_to_skip_.contains(abstract_op->getTypeID())) { + LLVM_DEBUG(llvm::dbgs() + << "Skipping shape inference for explicitly skipped op '" + << op->getName() << "'.\n"); + return WalkResult::advance(); + } + DCOMMENT_OP(op, "Inferring for"); if (auto infer_ti = dyn_cast(op)) { DCOMMENT("\tRefinining with type op interface"); @@ -3201,9 +3227,11 @@ static FailureOr InferShapeForFunction(ShapeInference& context, FailureOr InferShapeForFunction(func::FuncOp func, ArrayRef> arg_shapes, int64_t graph_version, - int64_t max_iterations) { + int64_t max_iterations, + ArrayRef ops_to_skip) { ShapeInference context(graph_version, func->getParentOfType(), - /*propagate_caller_callee_constants=*/true); + /*propagate_caller_callee_constants=*/true, + ops_to_skip); if (arg_shapes.empty()) { return InferShapeForFunction(context, func, max_iterations); } @@ -3255,7 
+3283,8 @@ FailureOr InferShapeForFunction(func::FuncOp func, return true; } -FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations) { +FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations, + ArrayRef ops_to_skip) { auto producer_or = tensorflow::GetTfGraphProducerVersion(module); if (!producer_or.ok()) { // TODO(jpienaar): Keeping the existing behavior for now but this could @@ -3268,7 +3297,8 @@ FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations) { // TODO(jpienaar): Clean up propagate_NextIterationSinkOp_callee_constants if // it is no longer needed. ShapeInference context(producer, module, - /*propagate_caller_callee_constants=*/false); + /*propagate_caller_callee_constants=*/false, + ops_to_skip); if (auto main = module.lookupSymbol("main")) context.enqueue(main); for (auto func : module.getOps()) context.enqueue(func); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h index e12c470d53f4cb..bc1cf7b3c8f475 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference.h @@ -20,9 +20,11 @@ limitations under the License. #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project -#include "mlir/IR/Operation.h" // from @llvm-project -#include "mlir/IR/Region.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Support/TypeID.h" // from @llvm-project namespace mlir { namespace TF { @@ -30,13 +32,22 @@ namespace TF { // Returns whether type can be further refined. bool CanBeRefined(Type type); -// Refines all the shapes in a module. 
+// Returns a new arg type based on the shape and element type. If there are +// dynamic bounds attribute to the arg, update the bounds based on the shape +// as well. +Type GetNewArgType(Type old_arg_type, ArrayRef shape, + Type element_type, mlir::MLIRContext* context); + +// Refines all the shapes in a module, skipping the inference for all ops +// whose type is in ops_to_skip. // Returns a failure() on error, otherwise returns true to indicate that it // reached convergence, false otherwise. -FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations = 10); +FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations = 10, + ArrayRef ops_to_skip = {}); // Given a list of refined shapes matching the function arguments of func, runs -// shape inference over the function to propagate this updated information. +// shape inference over the function to propagate this updated information, +// skipping the inference for all ops whose type is in ops_to_skip. // If arg_shapes are empty, then argument shapes will be left unchanged. // Note: This affects the entire module, and changes are not just scoped to the // function being inferred. 
@@ -45,7 +56,8 @@ FailureOr InferModuleShape(ModuleOp module, int64_t max_iterations = 10); FailureOr InferShapeForFunction(func::FuncOp func, ArrayRef> arg_shapes, int64_t graph_version, - int64_t max_iterations = 10); + int64_t max_iterations = 10, + ArrayRef ops_to_skip = {}); } // namespace TF diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc index 8180e2d4084dba..37bcb46b95cc57 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/shape_inference_pass.cc @@ -38,7 +38,7 @@ class ShapeInference public: void runOnOperation() override { auto failure_or_converged = - InferModuleShape(getOperation(), max_iterations_); + InferModuleShape(getOperation(), max_iterations_, /*ops_to_skip=*/{}); if (failed(failure_or_converged)) return signalPassFailure(); if (!failure_or_converged.value()) { getOperation().emitError() diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td index f50286123f2478..de47fbaca69635 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tf_passes.td @@ -516,6 +516,13 @@ def ExecutorConvertControlToDataOutputsPass : Pass<"tf-executor-convert-control- }]; let constructor = "tf_executor::CreateTFExecutorConvertControlToDataOutputsPass()"; + + let options = [ + Option<"composite_tpuexecute_side_effects_", "composite-tpuexecute-side-effects", "bool", + /*default=*/"false", + "Enables certain TPUExecute ops to run in parallel if they only " + "operate on resources that live on composite devices."> + ]; } def ExecutorUpdateControlDependenciesPass : Pass<"tf-executor-update-control-dependencies", "ModuleOp"> { diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc 
b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc index 5366f8e0f92702..3edb71bf08e627 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/tpu_sharding_identification_pass.cc @@ -17,8 +17,13 @@ limitations under the License. #include #include #include +#include #include +#include "absl/algorithm/container.h" +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" @@ -31,6 +36,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project #include "mlir/IR/Operation.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -41,9 +47,11 @@ limitations under the License. 
#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_traits.h" #include "tensorflow/compiler/mlir/tensorflow/utils/xla_sharding_util.h" #include "xla/client/sharding_builder.h" #include "xla/xla_data.pb.h" +#include "tsl/platform/errors.h" namespace mlir { namespace TFTPU { @@ -51,6 +59,8 @@ namespace { using OpShardingVariant = std::variant; using OpShardingVector = llvm::SmallVector; +using OptionalOpShardingVector = + llvm::SmallVector, 8>; constexpr char kReplicateSharding[] = ""; constexpr char kShardingAttr[] = "mhlo.sharding"; @@ -148,16 +158,6 @@ mlir::ArrayAttr GetStrArrayAttr(Builder* builder, return builder->getArrayAttr(strings); } -// Given a `tf_device.cluster_func` operand value return true iff it a device -// variable that should default to MAXIMAL sharding. Device variables that are -// per-replica or distributed default to MAXIMAL sharding, which corresponds to -// arguments of the `tf_device.replicate`. Otherwise the variable is broadcast, -// which corresponds to edges that are implicitly captured by the `replicate`. -bool IsMaximalVariable(Value value) { - auto read_var = value.getDefiningOp(); - return read_var && read_var->getParentOfType(); -} - // Verify whether the given sharding can be applied to the given (tensor) type. // (A bad sharding might mean failing tf.Split ops if the graph later executes // on CPU) @@ -248,7 +248,7 @@ std::optional AssignLogicalDeviceFromTPUReplicatedCoreAttr( // Cast op may be added right after the input. // // TODO(hongjunchoi): Add logic to parse XlaSharding op inside control flow (If, -// Case, While) ops and Caller return values. +// Case) ops and Caller return values. // TODO(hongjunchoi): Consider explicitly checking op patterns to detect sharded // inputs. 
std::optional GetXlaShardingFromArg( @@ -270,6 +270,15 @@ std::optional GetXlaShardingFromArg( return logical_device; } + if (auto while_op = llvm::dyn_cast(owner)) { + const int operand_number = use.getOperandNumber(); + next_values_to_visit.push_back( + while_op.getCond().front().getArgument(operand_number)); + next_values_to_visit.push_back( + while_op.getBody().front().getArgument(operand_number)); + continue; + } + if (llvm::isa(owner)) { next_values_to_visit.push_back(use.getOwner()->getResult(0)); continue; @@ -291,15 +300,16 @@ std::optional GetXlaShardingFromArg( return std::nullopt; } -// Extracts sharding configurations for all inputs by parsing XlaSharding/ -// TPUPartitionedInput op connected to the operands/arguments. If argument to -// the `cluster_func` directly feeds into another function call op, then -// recursively walk the function definition to find the connected XlaSharding -// op. +// Tries to extract sharding configurations for all inputs by parsing +// XlaSharding/ TPUPartitionedInput op connected to the operands/arguments. If +// argument to the `cluster_func` directly feeds into another function call op, +// then recursively walk the function definition to find the connected +// XlaSharding op. void IdentifyXlaShardingForComputationInputs( - const llvm::SmallVector& logical_device_vec, bool use_spmd, + const llvm::SmallVector& logical_device_vec, bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, - func::FuncOp func, Builder* builder, OpShardingVector& sharding_for_args) { + func::FuncOp func, Builder* builder, + OptionalOpShardingVector& sharding_for_args) { // Look up function definition from module. Block& function_block = func.front(); @@ -310,8 +320,6 @@ void IdentifyXlaShardingForComputationInputs( // 1) a TPUPartitionedInput Op if the input has a non-resource type; // 2) a ReadVariableOp else. // - // Replicate sharding is used if `use_spmd` is set. 
- // // Iterate through input arguments to the entry block of // tf_device.ClusterFunc. For input ops, look for XlaSharding ops. // XlaSharding ops can: @@ -340,17 +348,7 @@ void IdentifyXlaShardingForComputationInputs( } } - if (use_spmd && !IsMaximalVariable(operand)) { - // If XLA SPMD is enabled, host variables or non-variable per-replica - // inputs should take on replicate sharding, so that every device gets the - // whole tensor(s) (and can slice them up later). Exclude device - // variables, which always should take maximal sharding. - sharding_for_args.push_back(kReplicateSharding); - continue; - } - - // Otherwise, default to maximal sharding core 0. - sharding_for_args.push_back(logical_device_vec[0]); + sharding_for_args.push_back(std::nullopt); } } @@ -374,31 +372,32 @@ mlir::Operation* GetXlaShardingFromResult(Value value) { return nullptr; } -// Looks up arg->retval aliases for every argument, and builds a reverse map. -void ExtractAliases(func::FuncOp func, llvm::SmallVectorImpl& aliases) { - aliases.resize(func.getNumResults(), -1); - for (int i = 0; i < func.getNumArguments(); i++) { - if (auto v = func.getArgAttrOfType(i, kAliasingAttr)) { - int retval_index = v.getInt(); - if (retval_index >= 0 && retval_index < aliases.size()) { - aliases[retval_index] = i; +absl::Status DetermineShardingFromAlias( + func::FuncOp func, OptionalOpShardingVector& input_shardings, + OptionalOpShardingVector& output_shardings) { + for (int arg_idx = 0; arg_idx < func.getNumArguments(); ++arg_idx) { + if (auto v = + func.getArgAttrOfType(arg_idx, kAliasingAttr)) { + if (int retval_idx = v.getInt(); + retval_idx >= 0 && retval_idx < func.getNumResults()) { + auto& input_sharding = input_shardings[arg_idx]; + auto& output_sharding = output_shardings[retval_idx]; + + if (input_sharding.has_value() && output_sharding.has_value() && + input_sharding.value() != output_sharding.value()) { + return absl::InvalidArgumentError(absl::StrCat( + "arg#", arg_idx, " is aliased to 
retval#", retval_idx, + " but their sharding configurations don't match.")); + } else if (input_sharding.has_value() && !output_sharding.has_value()) { + output_sharding = input_sharding; + } else if (!input_sharding.has_value() && output_sharding.has_value()) { + input_sharding = output_sharding; + } } } } -} -// Returns XLA sharding from argument connected via tf.aliasing_output. -std::optional GetXlaShardingFromAlias( - Value value, llvm::SmallVectorImpl& aliases, - const OpShardingVector& sharding_for_args) { - int retval_index = value.cast().getResultNumber(); - if (retval_index >= 0 && retval_index < aliases.size()) { - int arg_index = aliases[retval_index]; - if (arg_index >= 0 && arg_index < sharding_for_args.size()) { - return GetShardingStringFromVariant(sharding_for_args[arg_index]); - } - } - return std::nullopt; + return absl::OkStatus(); } // Returns XLA sharding from XlaSharding op connected to a result value. @@ -471,26 +470,20 @@ std::optional GetXlaShardingFromRetval( return std::nullopt; } -// Extracts sharding configurations for all outputs by parsing XlaSharding/ -// TPUPartitionedOutput op connected to the retvals/results. +// Tries to extract sharding configurations for all outputs by parsing +// XlaSharding/ TPUPartitionedOutput op connected to the retvals/results. 
void IdentifyXlaShardingForComputationOutputs( - const llvm::SmallVector& logical_device_vec, bool use_spmd, + const llvm::SmallVector& logical_device_vec, bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, func::FuncOp func, Builder* builder, - const OpShardingVector& sharding_for_args, - OpShardingVector& sharding_for_rets) { + OptionalOpShardingVector& sharding_for_rets) { Block& function_block = func.front(); Operation* terminator = function_block.getTerminator(); sharding_for_rets.reserve(terminator->getNumOperands()); - llvm::SmallVector aliases; // maps return value index to arg index - ExtractAliases(func, aliases); - // Iterate through results of `cluster_func`. For output ops, look for // TPUPartitionedOutput ops. // - // Replicate sharding is used if `use_spmd` is set. - // // Iterate through operands of the terminator. If the preceding op is // XlaShardingOp, then the provided sharding configuration is added to the // tf_device.ClusterFunc as an attribute and the function as a result @@ -505,12 +498,6 @@ void IdentifyXlaShardingForComputationOutputs( continue; } - if (auto from_alias = - GetXlaShardingFromAlias(result, aliases, sharding_for_args)) { - sharding_for_rets.push_back(from_alias.value()); - continue; - } - if (infer_from_computation) { if (auto retval_sharding = GetXlaShardingFromRetval(retval.get(), logical_device_vec)) { @@ -519,18 +506,76 @@ void IdentifyXlaShardingForComputationOutputs( } } - if (use_spmd) { - // If XLA SPMD is enabled, we default to replicate sharding. This way, - // all devices get the whole tensor(s), but if there's an XlaSharding op - // deeper in the function, they can use dynamic-slice to slice off their - // part of the computation. 
- sharding_for_rets.push_back(kReplicateSharding); - continue; + sharding_for_rets.push_back(std::nullopt); + } +} + +void SetReplicatedOrMaximalShardingIfNoShardingFound( + const llvm::SmallVector& logical_device_vec, bool use_spmd, + OptionalOpShardingVector& shardings) { + for (auto& sharding : shardings) { + if (sharding == std::nullopt) { + // If we haven't found sharding, default to either replicated or maximal + // sharding depending on whether XLA SPMD is enabled. + if (use_spmd) { + // If XLA SPMD is enabled, host variables or non-variable per-replica + // inputs, and outputs should take on replicate sharding, so that every + // device gets the whole tensor(s) (and can slice them up later eg. + // using dynamic-slice). + sharding = kReplicateSharding; + } else { + // Otherwise, default to maximal sharding core 0. + sharding = logical_device_vec[0]; + } + } + } +} + +// Moves shardings from `optional_shardings` to `shardings`. +absl::Status MoveSharding(OptionalOpShardingVector& optional_shardings, + OpShardingVector& shardings) { + shardings.clear(); + for (auto& sharding : optional_shardings) { + if (!sharding) { + return absl::InternalError( + "Couldn't find/assign sharding for an input/output. All shardings " + "should have been identified by this point."); } - // Otherwise, default to maximal sharding core 0. - sharding_for_rets.push_back(logical_device_vec[0]); + shardings.push_back(std::move(sharding.value())); } + + return absl::OkStatus(); +} + +// Determines XlaSharding for inputs and outputs. If there are aliased +// inputs/outputs for which no sharding was found directly, the corresponding +// output/input sharding is used (if it exists). If we still don't find sharding +// for some inputs/outputs, we default to replicated or maximal sharding +// depending on `use_spmd`. 
+absl::Status IdentifyXlaShardingForInputsAndOutputs( + const llvm::SmallVector& logical_device_vec, bool use_spmd, + bool infer_from_computation, tf_device::ClusterFuncOp cluster_func, + func::FuncOp func, Builder* builder, OpShardingVector& input_sharding, + OpShardingVector& output_sharding) { + OptionalOpShardingVector optional_input_sharding; + OptionalOpShardingVector optional_output_sharding; + IdentifyXlaShardingForComputationInputs( + logical_device_vec, infer_from_computation, cluster_func, func, builder, + optional_input_sharding); + IdentifyXlaShardingForComputationOutputs( + logical_device_vec, infer_from_computation, cluster_func, func, builder, + optional_output_sharding); + TF_RETURN_IF_ERROR(DetermineShardingFromAlias(func, optional_input_sharding, + optional_output_sharding)); + SetReplicatedOrMaximalShardingIfNoShardingFound(logical_device_vec, use_spmd, + optional_input_sharding); + SetReplicatedOrMaximalShardingIfNoShardingFound(logical_device_vec, use_spmd, + optional_output_sharding); + TF_RETURN_IF_ERROR(MoveSharding(optional_input_sharding, input_sharding)); + TF_RETURN_IF_ERROR(MoveSharding(optional_output_sharding, output_sharding)); + + return absl::OkStatus(); } // Extracts input/output sharding configuration of `cluster_func` by parsing @@ -561,15 +606,15 @@ LogicalResult IdentifyXlaShardingForTPUComputation( } OpShardingVector sharding_for_args; - IdentifyXlaShardingForComputationInputs(logical_device_vec, use_spmd, - /*infer_from_computation=*/true, - cluster_func, func, builder, - sharding_for_args); - OpShardingVector sharding_for_rets; - IdentifyXlaShardingForComputationOutputs( - logical_device_vec, use_spmd, /*infer_from_computation=*/true, - cluster_func, func, builder, sharding_for_args, sharding_for_rets); + if (auto status = IdentifyXlaShardingForInputsAndOutputs( + logical_device_vec, use_spmd, + /*infer_from_computation=*/true, cluster_func, func, builder, + sharding_for_args, sharding_for_rets); + !status.ok()) { + 
LOG(ERROR) << status; + return failure(); + }; auto has_maximal_sharding = [](const OpShardingVariant& sharding_or_op) -> bool { @@ -592,14 +637,14 @@ LogicalResult IdentifyXlaShardingForTPUComputation( sharding_for_rets.clear(); cluster_func->setAttr(kUseSpmdAttr, builder->getBoolAttr(false)); - IdentifyXlaShardingForComputationInputs( - logical_device_vec, - /*use_spmd=*/false, /*infer_from_computation=*/false, cluster_func, - func, builder, sharding_for_args); - IdentifyXlaShardingForComputationOutputs( - logical_device_vec, - /*use_spmd=*/false, /*infer_from_computation=*/false, cluster_func, - func, builder, sharding_for_args, sharding_for_rets); + if (auto status = IdentifyXlaShardingForInputsAndOutputs( + logical_device_vec, /*use_spmd=*/false, + /*infer_from_computation=*/false, cluster_func, func, builder, + sharding_for_args, sharding_for_rets); + !status.ok()) { + LOG(ERROR) << status; + return failure(); + } } // Update sharding on function arguments and returns. @@ -645,7 +690,7 @@ void TPUShardingIdentificationPass::runOnOperation() { if (result.wasInterrupted()) return signalPassFailure(); } -} // anonymous namespace +} // namespace std::unique_ptr> CreateTPUShardingIdentificationPass() { return std::make_unique(); diff --git a/tensorflow/compiler/mlir/tensorflow/transforms/update_control_dependencies.cc b/tensorflow/compiler/mlir/tensorflow/transforms/update_control_dependencies.cc index 615911f0cbd741..9f36e838206804 100644 --- a/tensorflow/compiler/mlir/tensorflow/transforms/update_control_dependencies.cc +++ b/tensorflow/compiler/mlir/tensorflow/transforms/update_control_dependencies.cc @@ -116,7 +116,8 @@ LogicalResult FillOpToParallelIdsMap( if (id_pairs.empty()) continue; TF::ParallelIdsMap& ids_map = op_to_parallel_ids_map[island]; - for (auto [group_id, branch_id] : id_pairs) ids_map[group_id] = branch_id; + for (const auto& [group_id, branch_id] : id_pairs) + ids_map[group_id] = branch_id; } return success(); } diff --git 
a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc index 4160c31515cf62..95c67e6084d90a 100644 --- a/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tensorflow/translate/import_model.cc @@ -182,6 +182,17 @@ void LoadImporterDialects(mlir::MLIRContext& context) { context.getOrLoadDialect(name); } +absl::StatusOr GetDenseTensorNameFromTensorInfo( + const TensorInfo& tensor_info) { + // TODO(b/184675681): Support other encoding cases. + // + // TODO(b/184679394): Add unit test for this check. + TF_RET_CHECK(tensor_info.encoding_case() == tensorflow::TensorInfo::kName) + << "Only dense tensor is supported, but got encoding case " + << tensor_info.encoding_case(); + return tensor_info.name(); +} + // This class is used to generate new MLIR function name strings that are both // unique in the TF function library `flib_` and unique among the name strings // generated by the class object during its lifetime. @@ -3945,7 +3956,11 @@ SavedModelSignatureDefImporterLite::ConvertGraph( specs.graph_func_name = name; specs.prune_unused_nodes = true; TF_ASSIGN_OR_RETURN(specs.inputs, ParseInputArrays(inputs)); - for (auto& output : outputs) specs.outputs.push_back(output.second.name()); + for (auto& output : outputs) { + TF_ASSIGN_OR_RETURN(std::string name, + GetDenseTensorNameFromTensorInfo(output.second)); + specs.outputs.push_back(std::move(name)); + } specs.control_outputs = control_outputs; specs.enable_shape_inference = false; specs.unconditionally_use_set_output_shapes = @@ -4031,12 +4046,8 @@ SavedModelSignatureDefImporterLite::ParseInputArrays( for (const auto& iter : inputs) { const auto& tensor_info = iter.second; - // TODO(b/184675681): Support other encoding cases. - // - // TODO(b/184679394): Add unit test for this check. 
- TF_RET_CHECK(tensor_info.encoding_case() == tensorflow::TensorInfo::kName) - << "Only dense tensor is supported, but got encoding case " - << tensor_info.encoding_case(); + TF_ASSIGN_OR_RETURN(std::string name, + GetDenseTensorNameFromTensorInfo(tensor_info)); VLOG(1) << "Importing Signature Input: input_name = " << iter.first << ", tensor_info = " << tensor_info.DebugString(); @@ -4052,7 +4063,7 @@ SavedModelSignatureDefImporterLite::ParseInputArrays( array_info.shape.set_unknown_rank(true); } - results.insert(std::pair(tensor_info.name(), + results.insert(std::pair(std::move(name), std::move(array_info))); } return results; diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD index 693d1f37766d81..4efec99e6ecd4d 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/BUILD @@ -100,6 +100,7 @@ cc_library( hdrs = ["compile_tf_graph.h"], deps = [ "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:bridge_logger", "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:import_model", @@ -108,6 +109,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:translate_utils", "//tensorflow/compiler/mlir/tensorflow/transforms:set_tpu_infeed_layout", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", + "//tensorflow/compiler/mlir/tf2xla/internal:logging_hooks", "//tensorflow/compiler/tf2xla:layout_util", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_helpers", @@ -142,6 +144,9 @@ tf_cc_test( name = "compile_tf_graph_test", testonly = 1, srcs = ["compile_tf_graph_test.cc"], + data = [ + "testdata/prepare_to_library.mlir", + ], linkstatic = 1, deps = [ ":compile_tf_graph", @@ -161,6 +166,7 @@ tf_cc_test( "//tensorflow/core/lib/monitoring:cell_reader", 
"//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/core/tpu/kernels/xla:host_compute_ops", "@com_google_absl//absl/status", "@com_google_googletest//:gtest", "@llvm-project//mlir:FuncDialect", @@ -171,7 +177,6 @@ tf_cc_test( "@local_tsl//tsl/platform:statusor", "@local_xla//xla:shape_util", "@local_xla//xla/client:client_library", - "@local_xla//xla/stream_executor", "@local_xla//xla/translate/mhlo_to_hlo:type_to_shape", ], ) diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc index 003732ffb22f5a..0355204506068c 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/log/log.h" #include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project @@ -40,6 +41,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/serialize_mlir_module_utils.h" #include "tensorflow/compiler/mlir/tensorflow/utils/translate_utils.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/logging_hooks.h" #include "tensorflow/compiler/tf2xla/layout_util.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" @@ -62,6 +64,7 @@ limitations under the License. 
#include "tensorflow/core/platform/types.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/tpu_compile.h" +#include "tensorflow/core/util/debug_data_dumper.h" #include "tsl/lib/monitoring/sampler.h" #include "tsl/platform/errors.h" #include "tsl/platform/status.h" @@ -170,11 +173,21 @@ Status PrepareAndExportToLibrary(mlir::ModuleOp module, applyTensorflowAndCLOptions(manager); manager.addPass(mlir::TF::CreatePrepareTpuComputationForTfExportPass()); manager.addPass(mlir::TF::CreateTFRegionControlFlowToFunctional()); + manager.addPass(mlir::TF::CreateTFShapeInferencePass()); manager.addNestedPass( mlir::CreateFunctionalToExecutorDialectConversionPass()); manager.addPass(mlir::CreateBreakUpIslandsPass()); mlir::StatusScopedDiagnosticHandler diag_handler(module.getContext()); + + if (VLOG_IS_ON(2)) { + llvm::StringRef module_name = llvm::StringRef(); + constexpr const char* kDebugGroupBridgePhase2 = + "v1_prepare_and_export_to_library"; + internal::EnablePassIRPrinting(manager, kDebugGroupBridgePhase2, + module_name); + } + auto prepare_status = manager.run(module); auto diag_handler_status = diag_handler.ConsumeStatus(); // There are cases where the scoped diagnostic handler catches a failure that diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc index e3b2339eedbba5..9bea916088c133 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph_test.cc @@ -15,6 +15,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h" +#include #include #include @@ -213,9 +214,9 @@ TEST_F(CompileTFGraphTest, RecordsStreamzForFunctionToHlo) { EXPECT_EQ(compilation_status.Delta("kOldBridgeNoMlirSuccess"), 1); } -TEST_F(CompileTFGraphTest, CatchesErrorMissedByPassManagerRun) { +TEST_F(CompileTFGraphTest, SuccessfullyCompilesWithManualSharding) { // MLIR module from failing test. - constexpr char kUnsupportedManualSharding[] = R"( + constexpr char kSupportedManualSharding[] = R"( module @module___inference_tpu_function_41 attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 1617 : i32}} { func.func @main(%arg0: tensor<2x2xf32>) -> (tensor<2x2xf32> {mhlo.sharding = "\08\03\1A\02\02\01\22\02\00\01"}) { %0 = tf_executor.graph { @@ -223,7 +224,7 @@ TEST_F(CompileTFGraphTest, CatchesErrorMissedByPassManagerRun) { %outputs_0, %control_1 = tf_executor.island wraps "tf.XlaSharding"(%outputs) {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01", sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xf32>) -> tensor<2x2xf32> %outputs_2, %control_3 = tf_executor.island wraps "tf.XlaSpmdFullToShardShape"(%outputs_0) {dim = -1 : i64, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xf32>) -> tensor<1x2xf32> %control_4 = tf_executor.island wraps "tf._XlaHostComputeMlir"(%outputs_2) {host_mlir_module = "", manual_sharding = true, recv_key = "host_compute_channel_0_retvals", send_key = "host_compute_channel_0_args"} : (tensor<1x2xf32>) -> () - %outputs_5, %control_6 = tf_executor.island(%control_4) wraps "tf._XlaHostComputeMlir"() {host_mlir_module = "", manual_sharding = true, recv_key = "host_compute_channel_1_retvals", send_key = "host_compute_channel_1_args"} : () -> tensor<1x2xf32> + %outputs_5, %control_6 = tf_executor.island(%control_4) wraps "tf._XlaHostComputeMlir"() {host_mlir_module = "module {\0A func.func @host_func() -> tensor<1x2xf32> {\0A %0 
= \22tf.Const\22() {value = dense<0.1> : tensor<1x2xf32>} : () -> tensor<1x2xf32> \0A return %0 : tensor<1x2xf32>}}", manual_sharding = true, recv_key = "host_compute_channel_1_retvals", send_key = "host_compute_channel_1_args"} : () -> tensor<1x2xf32> %outputs_7, %control_8 = tf_executor.island wraps "tf.XlaSpmdShardToFullShape"(%outputs_5) {dim = -1 : i64, full_shape = #tf_type.shape<2x2>, manual_sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<1x2xf32>) -> tensor<2x2xf32> %outputs_9, %control_10 = tf_executor.island wraps "tf.XlaSharding"(%outputs_7) {_XlaSharding = "\08\03\1A\02\02\01\22\02\00\01", sharding = "\08\03\1A\02\02\01\22\02\00\01", unspecified_dims = []} : (tensor<2x2xf32>) -> tensor<2x2xf32> tf_executor.fetch %outputs_9 : tensor<2x2xf32> @@ -232,13 +233,11 @@ TEST_F(CompileTFGraphTest, CatchesErrorMissedByPassManagerRun) { } } )"; - auto mlir_to_hlo_args = CreateTestMlirToHloArgs(kUnsupportedManualSharding); + auto mlir_to_hlo_args = CreateTestMlirToHloArgs(kSupportedManualSharding); auto result = CompileWithComputation(mlir_to_hlo_args); - ASSERT_THAT(result.ok(), false); - EXPECT_THAT(result.status().message(), - testing::ContainsRegex("op manual_sharding")); + EXPECT_TRUE(result.ok()); } TEST_F(CompileTFGraphTest, DoesNotInlineStatelessRandomOps) { @@ -261,6 +260,24 @@ TEST_F(CompileTFGraphTest, DoesNotInlineStatelessRandomOps) { ComputationProtoContains("tf.StatelessRandomNormal")); } +TEST_F(CompileTFGraphTest, TestRunsShapeInference) { + static constexpr char kShapeInferenceModule[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> () { + %0 = "tf.Const"() <{value = dense<-1> : tensor<3360x8xi32>}> : () -> tensor<3360x8xi32> + %cst_33 = "tf.Const"() <{value = dense<[1120, -1]> : tensor<2xi32>}> : () -> tensor<2xi32> + %cst_34 = "tf.Const"() <{value = dense<[3, 1120, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> + %cst_63 = 
"tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %1965:4 = "tf._XlaHostComputeMlir"(%0, %cst_34, %cst_63, %cst_33) <{host_mlir_module = "#loc1 = loc(\22Reshape:\22)\0A#loc2 = loc(\22Reshape_4\22)\0A#loc3 = loc(\22Reshape\22)\0A#loc9 = loc(fused[#loc1, #loc2, #loc3])\0Amodule {\0A func.func @host_func(%arg0: tensor<3360x?xi32> loc(fused[#loc1, #loc2, #loc3]), %arg1: tensor<3xi32> loc(fused[#loc1, #loc2, #loc3]), %arg2: tensor loc(fused[#loc1, #loc2, #loc3]), %arg3: tensor<2xi32> loc(fused[#loc1, #loc2, #loc3])) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32>) {\0A %0 = \22tf.Reshape\22(%arg0, %arg1) {_xla_outside_compilation = \220\22} : (tensor<3360x?xi32>, tensor<3xi32>) -> tensor<3x1120x?xi32> loc(#loc9)\0A %1:3 = \22tf.Split\22(%arg2, %0) {_xla_outside_compilation = \220\22} : (tensor, tensor<3x1120x?xi32>) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1x1120x?xi32>) loc(#loc10)\0A %2 = \22tf.Reshape\22(%1#0, %arg3) {_xla_outside_compilation = \220\22} : (tensor<1x1120x?xi32>, tensor<2xi32>) -> tensor<1120x?xi32> loc(#loc11)\0A %3 = \22tf.Shape\22(%2) {_xla_outside_compilation = \220\22} : (tensor<1120x?xi32>) -> tensor<2xi32> loc(#loc12)\0A return %1#1, %1#2, %2, %3 : tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32> loc(#loc9)\0A } loc(#loc9)\0A} loc(#loc)\0A#loc = loc(unknown)\0A#loc4 = loc(\22Split:\22)\0A#loc5 = loc(\22split\22)\0A#loc6 = loc(\22Reshape_5\22)\0A#loc7 = loc(\22Shape:\22)\0A#loc8 = loc(\22Shape_4\22)\0A#loc10 = loc(fused[#loc4, #loc5])\0A#loc11 = loc(fused[#loc1, #loc6])\0A#loc12 = loc(fused[#loc7, #loc8])\0A", recv_key = "host_compute_channel_0_retvals", send_key = "host_compute_channel_0_args"}> : (tensor<3360x8xi32>, tensor<3xi32>, tensor, tensor<2xi32>) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32>) + return + } + } + )"; + + auto compilation_result = + 
CompileWithComputation(CreateTestMlirToHloArgs(kShapeInferenceModule)); + EXPECT_TRUE(compilation_result.ok()); +} } // namespace } // namespace v1 } // namespace tf2xla diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/prepare_to_library.mlir b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/prepare_to_library.mlir new file mode 100644 index 00000000000000..42e145effa742f --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/testdata/prepare_to_library.mlir @@ -0,0 +1,10 @@ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> () { + %0 = "tf.Const"() <{value = dense<-1> : tensor<3360x8xi32>}> : () -> tensor<3360x8xi32> + %cst_33 = "tf.Const"() <{value = dense<[1120, -1]> : tensor<2xi32>}> : () -> tensor<2xi32> + %cst_34 = "tf.Const"() <{value = dense<[3, 1120, -1]> : tensor<3xi32>}> : () -> tensor<3xi32> + %cst_63 = "tf.Const"() <{value = dense<0> : tensor}> : () -> tensor + %1965:4 = "tf._XlaHostComputeMlir"(%0, %cst_34, %cst_63, %cst_33) <{host_mlir_module = "#loc1 = loc(\22Reshape:\22)\0A#loc2 = loc(\22Reshape_4\22)\0A#loc3 = loc(\22Reshape\22)\0A#loc9 = loc(fused[#loc1, #loc2, #loc3])\0Amodule {\0A func.func @host_func(%arg0: tensor<3360x?xi32> loc(fused[#loc1, #loc2, #loc3]), %arg1: tensor<3xi32> loc(fused[#loc1, #loc2, #loc3]), %arg2: tensor loc(fused[#loc1, #loc2, #loc3]), %arg3: tensor<2xi32> loc(fused[#loc1, #loc2, #loc3])) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32>) {\0A %0 = \22tf.Reshape\22(%arg0, %arg1) {_xla_outside_compilation = \220\22} : (tensor<3360x?xi32>, tensor<3xi32>) -> tensor<3x1120x?xi32> loc(#loc9)\0A %1:3 = \22tf.Split\22(%arg2, %0) {_xla_outside_compilation = \220\22} : (tensor, tensor<3x1120x?xi32>) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1x1120x?xi32>) loc(#loc10)\0A %2 = \22tf.Reshape\22(%1#0, %arg3) {_xla_outside_compilation = \220\22} : (tensor<1x1120x?xi32>, tensor<2xi32>) -> 
tensor<1120x?xi32> loc(#loc11)\0A %3 = \22tf.Shape\22(%2) {_xla_outside_compilation = \220\22} : (tensor<1120x?xi32>) -> tensor<2xi32> loc(#loc12)\0A return %1#1, %1#2, %2, %3 : tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32> loc(#loc9)\0A } loc(#loc9)\0A} loc(#loc)\0A#loc = loc(unknown)\0A#loc4 = loc(\22Split:\22)\0A#loc5 = loc(\22split\22)\0A#loc6 = loc(\22Reshape_5\22)\0A#loc7 = loc(\22Shape:\22)\0A#loc8 = loc(\22Shape_4\22)\0A#loc10 = loc(fused[#loc4, #loc5])\0A#loc11 = loc(fused[#loc1, #loc6])\0A#loc12 = loc(fused[#loc7, #loc8])\0A", recv_key = "host_compute_channel_0_retvals", send_key = "host_compute_channel_0_args"}> : (tensor<3360x8xi32>, tensor<3xi32>, tensor, tensor<2xi32>) -> (tensor<1x1120x?xi32>, tensor<1x1120x?xi32>, tensor<1120x?xi32>, tensor<2xi32>) + return + } +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc index 9d0b884ebbe85d..aa9a73215b7284 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.cc @@ -91,8 +91,12 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addPass(mlir::createSymbolDCEPass()); if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_convert_control_to_data_outputs_pass) { + bool composite_tpuexecute_side_effects = + tensorflow::GetMlirCommonFlags() + ->tf_mlir_enable_composite_tpuexecute_side_effects; pm.addPass( - mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass()); + mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass( + composite_tpuexecute_side_effects)); } pm.addPass(mlir::TF::CreateVerifySuitableForExportPass()); } diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD index 6749f012da37fb..76369ade79193f 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD +++ 
b/tensorflow/compiler/mlir/tf2xla/api/v2/BUILD @@ -35,6 +35,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tf2xla:compile_mlir_util", "//tensorflow/compiler/mlir/tf2xla/api/v1:compile_tf_graph", + "//tensorflow/compiler/mlir/tf2xla/internal:compilation_timer", "//tensorflow/compiler/mlir/tf2xla/internal:legalize_tf_mlir", "//tensorflow/compiler/mlir/tf2xla/internal:legalize_tf_to_hlo", "//tensorflow/compiler/tf2xla:layout_util", @@ -52,6 +53,7 @@ cc_library( "@com_google_absl//absl/status", "@com_google_absl//absl/strings", "@com_google_absl//absl/types:variant", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:error_logging", @@ -76,12 +78,15 @@ tf_cc_test( "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/tf2xla:xla_compiler", "//tensorflow/compiler/tf2xla:xla_helpers", + "//tensorflow/core:framework", + "//tensorflow/core:lib", "//tensorflow/core:test", "//tensorflow/core:test_main", "//tensorflow/core/lib/monitoring:cell_reader", "//tensorflow/core/protobuf:for_core_protos_cc", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "@com_google_absl//absl/strings:str_format", "@com_google_googletest//:gtest", "@local_tsl//tsl/lib/monitoring:test_utils", "@local_tsl//tsl/platform:statusor", diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc index 4fa9b9bb98da3f..9729f755c611ce 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.cc @@ -160,19 +160,18 @@ void CreateTPUClusteringPipelineV2(OpPassManager &pm) { } tensorflow::Status RunFunctionTf2xlaClusteringBridge( - ModuleOp module, DeviceType device_type, bool is_in_fallback_enabled_mode, - llvm::StringRef module_name) { - bool is_replicated = device_type == 
DeviceType::XLA_TPU_JIT; + ModuleOp module, bool is_supported_by_replicated_brige, + bool is_in_fallback_enabled_mode, llvm::StringRef module_name) { std::string device_type_filter = - device_type == DeviceType::XLA_TPU_JIT ? "tpu" : "cpu/gpu"; + is_supported_by_replicated_brige ? "tpu" : "cpu/gpu"; VLOG(2) - << (is_replicated ? "Replicated" : "NonReplicated") + << (is_supported_by_replicated_brige ? "Replicated" : "NonReplicated") << " Bridge called stack trace is " << "(NOTE: this is not an error; rather the stack trace for debugging) : " << tensorflow::CurrentStackTrace(); Status clustering_status = - is_replicated + is_supported_by_replicated_brige ? RunTFXLABridge( module, [module_name](OpPassManager &pm) { @@ -187,12 +186,12 @@ tensorflow::Status RunFunctionTf2xlaClusteringBridge( }, module_name, /*dump_prefix=*/"tf_xla_bridge_v2_nonreplicated"); - // TODO(b/317798386): add is_replicated as a filter. + // TODO(b/317798386): add is_supported_by_replicated_brige as a filter. TF_RETURN_IF_ERROR(RecordIfErrorStatus( /*error_prefix=*/"clustering_v2", is_in_fallback_enabled_mode, device_type_filter, clustering_status)); - // TODO(b/317798386): add is_replicated as a filter. + // TODO(b/317798386): add is_supported_by_replicated_brige as a filter. tensorflow::metrics::UpdateTfMlirBridgeFirstPhaseCounter( device_type_filter, /*bridge_version=*/"v2", /*fallback_enabled=*/is_in_fallback_enabled_mode, diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h index e1298ac53560d3..8963fe7b126b9e 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h @@ -39,17 +39,19 @@ namespace v2 { // Inputs: // module - The MLIR Module that will be clustered. Expected to be in TF // Executor Dialect or TF Functional Dialect. Will convert to TF Functional. -// . device_type - The device type to cluster for. 
-// is_in_fallback_enabled_mode - Whether this was called with fallback to the -// non-MLIR Bridge. This is just for logging purposes and doesn't affect -// logic. -// module_name - What the input module name is for debugging help. +// is_supported_by_replicated_brige - If the graph targets the replicated +// bridge. Set it to true for replicated/partitioned graphs. e.g. replicated +// and single-core TPU graphs. Set this to false if the graph is not +// replicated, e.g. CPU/GPU graphs. is_in_fallback_enabled_mode - Whether this +// was called with fallback to the non-MLIR Bridge. This is just for logging +// purposes and doesn't affect logic. module_name - What the input module name +// is for debugging help. // // Output: Modifies the input module in place with clustered operations. // status - Whether the transformation to cluster the input MLIR module was // successful. tensorflow::Status RunFunctionTf2xlaClusteringBridge( - mlir::ModuleOp module, DeviceType device_type, + mlir::ModuleOp module, bool is_supported_by_replicated_brige, bool is_in_fallback_enabled_mode, llvm::StringRef module_name = llvm::StringRef()); } // namespace v2 diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc index d00d8b43d9e790..c4a96702533c49 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf_test.cc @@ -82,14 +82,14 @@ class FunctionClusterTensorflowDialectTest : public ::testing::Test { OwningOpRef mlir_module_; }; -TEST_F(FunctionClusterTensorflowDialectTest, ClustersTfTPU) { +TEST_F(FunctionClusterTensorflowDialectTest, ClustersTfReplicatedBridge) { CellReader compilation_status(kCompilationStreamz); TF_ASSERT_OK(CreateMlirModule("empty_func.mlir")); - TF_EXPECT_OK( - RunFunctionTf2xlaClusteringBridge(*mlir_module_, DeviceType::XLA_TPU_JIT, - /*is_in_fallback_enabled_mode=*/false)); + 
TF_EXPECT_OK(RunFunctionTf2xlaClusteringBridge( + *mlir_module_, /*is_supported_by_replicated_brige*/ true, + /*is_in_fallback_enabled_mode=*/false)); FuncOp main = mlir_module_->lookupSymbol("main"); ASSERT_TRUE(main); @@ -98,14 +98,15 @@ TEST_F(FunctionClusterTensorflowDialectTest, ClustersTfTPU) { compilation_status.Delta("tpu", "v2", "fallback_disabled", "success"), 1); } -TEST_F(FunctionClusterTensorflowDialectTest, RunsOutsideCompilationTPU) { +TEST_F(FunctionClusterTensorflowDialectTest, + RunsOutsideCompilationReplicatedBridge) { CellReader compilation_status(kCompilationStreamz); TF_ASSERT_OK(CreateMlirModule("outside_compilation.mlir")); - TF_EXPECT_OK( - RunFunctionTf2xlaClusteringBridge(*mlir_module_, DeviceType::XLA_TPU_JIT, - /*is_in_fallback_enabled_mode=*/false)); + TF_EXPECT_OK(RunFunctionTf2xlaClusteringBridge( + *mlir_module_, /*is_supported_by_replicated_brige*/ true, + /*is_in_fallback_enabled_mode=*/false)); FuncOp main = mlir_module_->lookupSymbol("main"); ASSERT_TRUE(main); @@ -121,31 +122,14 @@ TEST_F(FunctionClusterTensorflowDialectTest, RunsOutsideCompilationTPU) { compilation_status.Delta("tpu", "v2", "fallback_disabled", "success"), 1); } -TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFCPU) { +TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFNonReplicatedBridge) { CellReader compilation_status(kCompilationStreamz); TF_ASSERT_OK(CreateMlirModule("empty_func.mlir")); - TF_EXPECT_OK( - RunFunctionTf2xlaClusteringBridge(*mlir_module_, DeviceType::XLA_CPU_JIT, - /*is_in_fallback_enabled_mode=*/false)); - - FuncOp main = mlir_module_->lookupSymbol("main"); - ASSERT_TRUE(main); - - EXPECT_EQ( - compilation_status.Delta("cpu/gpu", "v2", "fallback_disabled", "success"), - 1); -} - -TEST_F(FunctionClusterTensorflowDialectTest, ClustersTFGPU) { - CellReader compilation_status(kCompilationStreamz); - - TF_ASSERT_OK(CreateMlirModule("empty_func.mlir")); - - TF_EXPECT_OK( - RunFunctionTf2xlaClusteringBridge(*mlir_module_, 
DeviceType::XLA_GPU_JIT, - /*is_in_fallback_enabled_mode=*/false)); + TF_EXPECT_OK(RunFunctionTf2xlaClusteringBridge( + *mlir_module_, /*is_supported_by_replicated_brige*/ false, + /*is_in_fallback_enabled_mode=*/false)); FuncOp main = mlir_module_->lookupSymbol("main"); ASSERT_TRUE(main); @@ -160,9 +144,9 @@ TEST_F(FunctionClusterTensorflowDialectTest, LogsFallbackMode) { TF_ASSERT_OK(CreateMlirModule("empty_func.mlir")); - TF_EXPECT_OK( - RunFunctionTf2xlaClusteringBridge(*mlir_module_, DeviceType::XLA_TPU_JIT, - /*is_in_fallback_enabled_mode=*/true)); + TF_EXPECT_OK(RunFunctionTf2xlaClusteringBridge( + *mlir_module_, /*is_supported_by_replicated_brige*/ true, + /*is_in_fallback_enabled_mode=*/true)); EXPECT_EQ( compilation_status.Delta("tpu", "v2", "fallback_enabled", "success"), 1); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc index 8bfb3308ffead2..d297e45b70e0bb 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf.cc @@ -25,9 +25,11 @@ limitations under the License. #include "absl/status/status.h" #include "absl/strings/str_cat.h" #include "absl/types/variant.h" +#include "llvm/ADT/ScopeExit.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_mlir_util.h" #include "tensorflow/compiler/mlir/tf2xla/api/v1/compile_tf_graph.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/compilation_timer.h" #include "tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.h" #include "tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h" #include "tensorflow/compiler/tf2xla/layout_util.h" @@ -38,7 +40,9 @@ limitations under the License. 
#include "tensorflow/core/graph/graph.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/util/debug_data_dumper.h" #include "tensorflow/core/util/dump_graph.h" +#include "tsl/lib/monitoring/sampler.h" #include "tsl/platform/error_logging.h" #include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" @@ -53,9 +57,17 @@ using tpu::FunctionToHloArgs; using tpu::MlirToHloArgs; using tpu::ShardingAndIndex; +auto* phase2_bridge_compilation_time = tsl::monitoring::Sampler<1>::New( + {"/tensorflow/core/tf2xla/api/v2/phase2_compilation_time", + "The wall-clock time spent on executing graphs in milliseconds.", + "configuration"}, + // Power of 1.5 with bucket count 45 (> 23 hours) + {tsl::monitoring::Buckets::Exponential(1, 1.5, 45)}); + // Name of component for error logging. This name is fixed and required to // enable logging. constexpr char kBridgeComponent[] = "TFXLABridge"; +constexpr char kFullBridge[] = "full_bridge"; namespace { @@ -73,6 +85,7 @@ void DumpComputationInput( if (!VLOG_IS_ON(2)) { return; } + switch (computation.index()) { case 0: VLOG(2) << "LegalizeMlirToHlo with MLIR computation input: " @@ -95,25 +108,28 @@ void DumpComputationInput( Status DumpHloCompilationResult(std::string_view name, XlaCompilationResult* compilation_result) { - if (VLOG_IS_ON(2)) { - TF_ASSIGN_OR_RETURN( - auto hlo_module_config, - xla::HloModule::CreateModuleConfigFromProto( - compilation_result->computation->proto(), xla::DebugOptions())); - - TF_ASSIGN_OR_RETURN( - std::unique_ptr hlo_module, - xla::HloModule::CreateFromProto( - compilation_result->computation->proto(), hlo_module_config)); - - std::string all_computations; - for (auto computation : hlo_module->computations()) { - all_computations += computation->ToString() + "\n\n"; - } - - tensorflow::DumpRawStringToFile(name, all_computations); + if (!VLOG_IS_ON(2) && + !DEBUG_DATA_DUMPER()->ShouldDump(std::string(name), 
kDebugGroupMain)) { + return OkStatus(); } + TF_ASSIGN_OR_RETURN( + auto hlo_module_config, + xla::HloModule::CreateModuleConfigFromProto( + compilation_result->computation->proto(), xla::DebugOptions())); + + TF_ASSIGN_OR_RETURN( + std::unique_ptr hlo_module, + xla::HloModule::CreateFromProto(compilation_result->computation->proto(), + hlo_module_config)); + + std::string all_computations; + for (auto computation : hlo_module->computations()) { + all_computations += computation->ToString() + "\n\n"; + } + + tensorflow::DumpRawStringToFile(name, all_computations); + return OkStatus(); } @@ -129,6 +145,12 @@ tsl::StatusOr LegalizeMlirToHlo( std::vector* arg_core_mapping, std::vector>* per_core_arg_shapes, xla::CompileOnlyClient* client) { + CompilationTimer timer; + auto record_time = llvm::make_scope_exit([&timer] { + phase2_bridge_compilation_time->GetCell(kFullBridge) + ->Add(timer.ElapsedCyclesInMilliseconds()); + }); + auto compilation_result = std::make_unique(); DumpComputationInput(computation); @@ -140,7 +162,8 @@ tsl::StatusOr LegalizeMlirToHlo( arg_shapes, arg_core_mapping, per_core_arg_shapes, client, compilation_result.get())); - DumpHloCompilationResult("legalize_tf_fallback", compilation_result.get()) + DumpHloCompilationResult("legalize_tf_fallback.hlo", + compilation_result.get()) .IgnoreError(); return *compilation_result; } @@ -155,15 +178,15 @@ tsl::StatusOr LegalizeMlirToHlo( VLOG(1) << "Successfully compiled MLIR computation to XLA HLO using " "Combined MLIR and XlaBuilder Bridge."; - DumpHloCompilationResult("legalize_tf_combined_bridge", + DumpHloCompilationResult("legalize_tf_combined_bridge.hlo", compilation_result.get()) .IgnoreError(); return *compilation_result; } - VLOG(1) - << "Failed to compile MLIR computation to XLA HLO using " - "Combined MLIR and XlaBuilder Bridge. Falling back to Graph Bridge."; + VLOG(1) << "Failed to compile MLIR computation to XLA HLO using Combined " + "MLIR and XlaBuilder Bridge. 
Falling back to MLIR tf2xla Bridge. " + << combined_bridge_status.status(); tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_COMBINED_BRIDGE", combined_bridge_status.status().ToString()) .IgnoreError(); @@ -176,21 +199,22 @@ tsl::StatusOr LegalizeMlirToHlo( if (mlir_bridge_status.ok()) { VLOG(1) << "Successfully compiled MLIR computation to XLA HLO using MLIR " - "tf2xla bridge"; + "tf2xla Bridge"; IncrementTfMlirBridgeSecondPhaseCounter( MlirBridgeSecondPhaseMetric::kMlirWithFallbackModeSuccess); - DumpHloCompilationResult("legalize_tf_mlir_bridge", + DumpHloCompilationResult("legalize_tf_mlir_bridge.hlo", compilation_result.get()) .IgnoreError(); return *compilation_result; } else if (mlir_bridge_status.status() == CompileToHloGraphAnalysisFailedError()) { VLOG(1) << "Filtered out MLIR computation to XLA HLO using MLIR tf2xla " - "bridge. Falling back to Combined Bridge."; + "Bridge. Could not generate HLO."; } else { - VLOG(1) << "Failed to compile MLIR computation to XLA HLO using " - "MLIR Bridge. Falling back to Combined Bridge."; + VLOG(1) << "Failed to compile MLIR computation to XLA HLO using MLIR " + "tf2xla Bridge. Could not generate HLO. " + << mlir_bridge_status.status(); tsl::error_logging::Log(kBridgeComponent, "TFXLA_API_V2_PHASE2_MLIR_BRIDGE", mlir_bridge_status.status().ToString()) .IgnoreError(); diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc index ab7aa70b9bcc74..99c213916f6f3f 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/legalize_tf_test.cc @@ -17,19 +17,26 @@ limitations under the License. 
#include #include +#include #include #include #include +#include "absl/strings/str_format.h" #include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/device_type.pb.h" #include "tensorflow/compiler/tf2xla/xla_compiler.h" #include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/client/client_library.h" #include "tensorflow/core/lib/monitoring/cell_reader.h" +#include "tensorflow/core/lib/monitoring/test_utils.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/test.h" #include "tensorflow/core/protobuf/config.pb.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" +#include "tensorflow/core/util/debug_data_dumper.h" +#include "tsl/lib/core/status_test_util.h" #include "tsl/lib/monitoring/test_utils.h" #include "tsl/platform/statusor.h" @@ -38,14 +45,15 @@ namespace tf2xla { namespace v2 { using ::tensorflow::monitoring::testing::CellReader; +using ::testing::TestWithParam; using tpu::FunctionToHloArgs; using tpu::MlirToHloArgs; using tpu::ShardingAndIndex; using tpu::TPUCompileMetadataProto; -using ::tsl::monitoring::testing::Histogram; static constexpr char kCompilationTimeStreamzName[] = "/tensorflow/core/tf2xla/api/v2/phase2_compilation_time"; +static constexpr char kFullBridge[] = "full_bridge"; static constexpr char kCompilationStatusStreamzName[] = "/tensorflow/core/tf2xla/api/v2/phase2_compilation_status"; static const char kMlirWithFallbackModeSuccess[] = @@ -106,6 +114,7 @@ tsl::StatusOr CompileMlirModule( std::vector arg_shapes; TPUCompileMetadataProto metadata_proto; + metadata_proto.add_retvals(); bool use_tuple_args = true; std::vector arg_core_mapping; std::vector> per_core_arg_shapes; @@ -131,6 +140,89 @@ TEST(LegalizeTFTest, RecordsStreamzForSuccessfulLegalizeWithMlirBridge) { EXPECT_EQ(compilation_status.Delta(kMlirWithFallbackModeFailure), 0); } +TEST(LegalizeTFTest, MatMul) 
{ + static constexpr char kMatMulModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> (tensor<5x11xf32>) { + %arg0 = "tf.Const"() {value = dense<-3.0> : tensor<5x7xf32>} : () -> tensor<5x7xf32> + %arg1 = "tf.Const"() {value = dense<-3.0> : tensor<11x7xf32>} : () -> tensor<11x7xf32> + + %1 = "tf.MatMul"(%arg0, %arg1) {transpose_a = false, transpose_b = true} : (tensor<5x7xf32>, tensor<11x7xf32>) -> tensor<5x11xf32> + + func.return %1 : tensor<5x11xf32> + } + })"; + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + kMatMulModuleStr, + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED)); +} + +struct MatMulTestCase { + std::string mat_mul_method; +}; + +using BatchMatMulTest = TestWithParam; + +TEST_P(BatchMatMulTest, BatchMatMul) { + const MatMulTestCase& test_case = GetParam(); + static constexpr char kMatMulModuleStr[] = R"( + module attributes {tf.versions = {bad_consumers = [], min_consumer = 0 : i32, producer = 268 : i32}} { + func.func @main() -> (tensor<1x4x4xf32>) { + %%arg0 = "tf.Const"() {value = dense<-3.0> : tensor<1x4x2xf32>} : () -> tensor<1x4x2xf32> + %%arg1 = "tf.Const"() {value = dense<-3.0> : tensor<1x2x4xf32>} : () -> tensor<1x2x4xf32> + + %%1 = "tf.%s"(%%arg0, %%arg1) {T = f32, adj_x = false, adj_y = false, grad_x = false, grad_y = false, device = ""} : (tensor<1x4x2xf32>, tensor<1x2x4xf32>) -> tensor<1x4x4xf32> + + func.return %%1 : tensor<1x4x4xf32> + } + })"; + std::string mat_mul_method = + absl::StrFormat(kMatMulModuleStr, test_case.mat_mul_method); + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + mat_mul_method.c_str(), + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED)); +} + +INSTANTIATE_TEST_SUITE_P( + BatchMatMulTest, BatchMatMulTest, + ::testing::ValuesIn({ + {"BatchMatMul"}, + {"BatchMatMulV2"}, + {"BatchMatMulV3"}, + }), + [](const 
::testing::TestParamInfo& info) { + return info.param.mat_mul_method; + }); + +TEST(LegalizeTFTest, DumpsProducedHLO) { + Env* env = Env::Default(); + std::string test_dir = testing::TmpDir(); + setenv("TF_DUMP_GRAPH_PREFIX", test_dir.c_str(), /*overwrite=*/1); + setenv("TF_DUMP_GRAPH_NAME_FILTER", "*", 1); + DEBUG_DATA_DUMPER()->LoadEnvvars(); + + std::vector files; + TF_ASSERT_OK(env->GetChildren(test_dir, &files)); + int original_files_size = files.size(); + + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + kMlirModuleStr, + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_UNSPECIFIED)); + + // Due to the shared test of this infrastructure, we just need to make sure + // that the dumped file size is greater than what was originally inside + // the test directory. + TF_ASSERT_OK(env->GetChildren(test_dir, &files)); + EXPECT_THAT(files.size(), ::testing::Gt(original_files_size)); + setenv("TF_DUMP_GRAPH_PREFIX", test_dir.c_str(), /*overwrite=*/0); +} + TEST(LegalizeTFTest, RecordsStreamzForFailedLegalizeWithMlirBridge) { CellReader compilation_status(kCompilationStatusStreamzName); @@ -203,6 +295,20 @@ TEST(LegalizeTFTest, RecordsStreamzForNoMlirFallback) { EXPECT_FALSE(compile_result.ok()); } +TEST(LegalizeTFTest, RecordsCompilationTimeForSuccessfulCompilation) { + CellReader compilation_time( + kCompilationTimeStreamzName); + + TF_ASSERT_OK_AND_ASSIGN( + XlaCompiler::CompilationResult result, + CompileMlirModule( + kMlirModuleStr, + ConfigProto::Experimental::MLIR_BRIDGE_ROLLOUT_ENABLED)); + + // Compilation time should have been updated. 
+ EXPECT_GT(compilation_time.Delta(kFullBridge).num(), 0); +} + } // namespace v2 } // namespace tf2xla } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc index 455a59d6607c49..9befa9b7714a27 100644 --- a/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc +++ b/tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.cc @@ -90,8 +90,12 @@ void AddTfDialectToExecutorPasses(OpPassManager &pm) { pm.addPass(mlir::createSymbolDCEPass()); if (tensorflow::GetMlirCommonFlags() ->tf_mlir_enable_convert_control_to_data_outputs_pass) { + bool composite_tpuexecute_side_effects = + tensorflow::GetMlirCommonFlags() + ->tf_mlir_enable_composite_tpuexecute_side_effects; pm.addPass( - mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass()); + mlir::tf_executor::CreateTFExecutorConvertControlToDataOutputsPass( + composite_tpuexecute_side_effects)); } pm.addPass(mlir::TF::CreateVerifySuitableForExportPass()); } diff --git a/tensorflow/compiler/mlir/tf2xla/internal/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/BUILD index f211357de28df9..1f96ecb120795d 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/BUILD @@ -243,3 +243,47 @@ tf_cc_test( "@local_tsl//tsl/platform:status", ], ) + +cc_library( + name = "mlir_bridge_pass_util", + srcs = ["mlir_bridge_pass_util.cc"], + hdrs = ["mlir_bridge_pass_util.h"], + visibility = ["//tensorflow/compiler/tf2xla:__pkg__"], + deps = [ + "//tensorflow/compiler/tf2xla:tf2xla_defs", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core/common_runtime:function_body", + "@com_google_absl//absl/log", + "@com_google_absl//absl/status", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Support", + "@local_tsl//tsl/platform:status", + ], + alwayslink = 1, +) + +tf_cc_test( + name 
= "mlir_bridge_pass_util_test", + srcs = ["mlir_bridge_pass_util_test.cc"], + deps = [ + ":mlir_bridge_pass_util", + "//tensorflow/cc:function_ops", + "//tensorflow/cc:functional_ops", + "//tensorflow/cc:ops", + "//tensorflow/cc:scope", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/tf2xla:tf2xla_defs", + "//tensorflow/core:core_cpu_base", + "//tensorflow/core:framework", + "//tensorflow/core:test_main", + "//tensorflow/core:testlib", + "//tensorflow/core/platform:enable_tf2_utils", + "@com_google_googletest//:gtest_main", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Parser", + "@local_tsl//tsl/lib/core:status_test_util", + ], +) diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc index ed04adc6a394ff..f00df1513215cc 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes.cc @@ -143,6 +143,8 @@ void AddReplicatedBridgeClusteringPipelinePasses(OpPassManager& pm, pm.addNestedPass(mlir::TFDevice::CreateClusterConstantSinkingPass()); pm.addPass(mlir::TF::CreateResourceDeviceInferencePass()); + pm.addNestedPass( + tensorflow::tf2xla::internal::CreateHoistBroadcastReadPass()); pm.addPass(mlir::TFDevice::CreateClusterOutliningPass()); pm.addPass(mlir::TFTPU::CreateTPUResourceReadForWritePass()); pm.addPass(mlir::TFDevice::CreateMarkInputOutputAliasesPass()); diff --git a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc index c9cc5a4d1df16c..756ec42f31268c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/clustering_bridge_passes_test.cc @@ -28,7 +28,7 @@ TEST(ClusteringBridgePassesTest, AddsBridgePasses) { OpPassManager 
pass_manager; AddReplicatedBridgeClusteringPipelinePasses(pass_manager); - EXPECT_EQ(pass_manager.size(), 43); + EXPECT_EQ(pass_manager.size(), 44); } TEST(ClusteringBridgePassesTest, AddsNonTPUBridgePasses) { diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc index 3b11eaaf2287d6..4fd0c21d68331f 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_mlir.cc @@ -44,7 +44,6 @@ limitations under the License. #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tensorflow/core/tpu/tpu_compile.h" -#include "tsl/lib/monitoring/sampler.h" #include "tsl/platform/error_logging.h" #include "tsl/platform/errors.h" #include "tsl/platform/statusor.h" @@ -52,12 +51,6 @@ limitations under the License. namespace tensorflow { namespace tf2xla { namespace internal { -auto* phase2_bridge_compilation_time = tsl::monitoring::Sampler<1>::New( - {"/tensorflow/core/tf2xla/api/v2/phase2_compilation_time", - "The wall-clock time spent on executing graphs in milliseconds.", - "configuration"}, - // Power of 1.5 with bucket count 45 (> 23 hours) - {tsl::monitoring::Buckets::Exponential(1, 1.5, 45)}); // Name of component for error logging. This name is fixed and required to // enable logging. @@ -126,20 +119,11 @@ tsl::StatusOr LegalizeWithMlirBridge( // Enabling op fallback also enables whole graph fallback if op by op // fallback failed. 
- tsl::StatusOr mlir_bridge_status; - { - CompilationTimer timer; - const std::string kMlirBridgeFallback = "mlir_bridge_op_fallback_enabled"; - - mlir_bridge_status = CompileFromMlirToXlaHlo( - /*lower_to_xla_hlo=*/true, computation, metadata, device_type, - shape_determination_fns, use_tuple_args, compilation_result, - custom_legalization_passes, arg_shapes, arg_core_mapping, - per_core_arg_shapes); - - phase2_bridge_compilation_time->GetCell(kMlirBridgeFallback) - ->Add(timer.ElapsedCyclesInMilliseconds()); - } + tsl::StatusOr mlir_bridge_status = CompileFromMlirToXlaHlo( + /*lower_to_xla_hlo=*/true, computation, metadata, device_type, + shape_determination_fns, use_tuple_args, compilation_result, + custom_legalization_passes, arg_shapes, arg_core_mapping, + per_core_arg_shapes); if (mlir_bridge_status.ok()) { VLOG(1) << "Successfully compiled MLIR computation to XLA HLO using MLIR " diff --git a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc index 8d7531f87eed3e..2d321246463494 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc +++ b/tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.cc @@ -16,6 +16,7 @@ limitations under the License. #include "tensorflow/compiler/mlir/tf2xla/internal/legalize_tf_to_hlo.h" #include +#include #include #include "absl/log/log.h" @@ -33,19 +34,11 @@ limitations under the License. 
#include "tensorflow/core/platform/status.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" -#include "tsl/lib/monitoring/sampler.h" #include "tsl/platform/statusor.h" namespace tensorflow { namespace tf2xla { namespace internal { -auto* phase2_combined_bridge_compilation_time = - tsl::monitoring::Sampler<1>::New( - {"/tensorflow/core/tf2xla/api/v2/phase2_combined_compilation_time", - "The wall-clock time spent on combined graphs in milliseconds.", - "configuration"}, - // Power of 1.5 with bucket count 45 (> 23 hours) - {tsl::monitoring::Buckets::Exponential(1, 1.5, 45)}); using metrics::IncrementTfMlirBridgeSecondPhaseCounter; using metrics::MlirBridgeSecondPhaseMetric; @@ -63,14 +56,14 @@ tsl::StatusOr LegalizeTfToHlo( xla::CompileOnlyClient* client, XlaCompilationResult* compilation_result) { LOG_FIRST_N(INFO, 1) << "Compiling MLIR computation to XLA HLO using the " "Combined MLIR Tf2Xla Bridge."; - CompilationTimer timer; - constexpr char kCombinedBridgeTimer[] = "combined_bridge"; - auto mlir_compilation = internal::CompileFromMlirToXlaHlo( - /*lower_to_xla_hlo=*/false, computation, metadata, device_type, - shape_determination_fns, use_tuple_args, compilation_result, - custom_legalization_passes, arg_shapes, arg_core_mapping, - per_core_arg_shapes); + tsl::StatusOr mlir_compilation + + = internal::CompileFromMlirToXlaHlo( + /*lower_to_xla_hlo=*/false, computation, metadata, device_type, + shape_determination_fns, use_tuple_args, compilation_result, + custom_legalization_passes, arg_shapes, arg_core_mapping, + per_core_arg_shapes); if (!mlir_compilation.ok()) { IncrementTfMlirBridgeSecondPhaseCounter( @@ -94,8 +87,6 @@ tsl::StatusOr LegalizeTfToHlo( IncrementTfMlirBridgeSecondPhaseCounter( MlirBridgeSecondPhaseMetric::kMlirCombinedOldSuccess); - phase2_combined_bridge_compilation_time->GetCell(kCombinedBridgeTimer) - ->Add(timer.ElapsedCyclesInMilliseconds()); return 
*compilation_result; } diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc new file mode 100644 index 00000000000000..3672b0bbe0f47a --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.cc @@ -0,0 +1,177 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h" + +#include +#include +#include + +#include "absl/log/log.h" +#include "absl/status/status.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/tf2xla/tf2xla_defs.h" +#include "tensorflow/core/common_runtime/function_body.h" +#include "tensorflow/core/common_runtime/function_def_utils.h" +#include "tensorflow/core/graph/graph.h" +#include "tsl/platform/status.h" + +namespace tensorflow { + +using ::mlir::failure; +using ::mlir::LogicalResult; +using ::mlir::success; + +namespace { +LogicalResult HasAttr( + const Graph& graph, const FunctionLibraryDefinition* function_library, + const std::function& 
predicate) { + if (predicate(graph)) { + return success(); + } + + // Check if any reachable functions from the graph have the target attribute. + GraphDef graph_def; + graph.ToGraphDef(&graph_def); + if (!function_library) return failure(); + for (const std::string& func_name : + function_library->ReachableDefinitions(graph_def).ListFunctionNames()) { + const FunctionDef* func_def = function_library->Find(func_name); + std::unique_ptr func_body; + absl::Status status = FunctionDefToBodyHelper( + *func_def, AttrSlice(&func_def->attr()), function_library, &func_body); + // This is not expected to happen in practice + if (!status.ok()) { + LOG(ERROR) << "Failed to parse " << func_name << ": " + << tsl::NullTerminatedMessage(status); + return failure(); + } + if (predicate(*func_body->graph)) { + return success(); + } + } + return failure(); +} + +bool IsNonReplicatedGraph(const Graph& graph, + const FunctionLibraryDefinition* function_library) { + auto predicate = [](const Graph& graph) { + const std::string kStatefulPartitionedCallOp = "StatefulPartitionedCall"; + for (const Node* node : graph.nodes()) { + auto node_op = node->type_string(); + if (node_op == kStatefulPartitionedCallOp) { + // Functions called by StatefulPartitionedCall ops with + // _XlaMustCompile=true are compiled by XLA. + auto attr = node->attrs().FindByString(std::string(kMustCompileAttr)); + if (attr != nullptr && attr->b() == true) { + return true; + } + } + } + return false; + }; + return HasAttr(graph, function_library, predicate).succeeded(); +} + +bool IsReplicatedGraph(const Graph& graph, + const FunctionLibraryDefinition* function_library) { + auto predicate = [](const Graph& graph) { + for (const Node* node : graph.nodes()) { + // _tpu_replicate is used in replicated TPU graphs. It will be converted + // to _replication_info and _xla_compile_device_type in phase 1 pipelines.
+ if (node->attrs().FindByString(std::string(kTpuReplicateAttr))) { + return true; + } + } + return false; + }; + return HasAttr(graph, function_library, predicate).succeeded(); +} + +bool IsSingleCoreTpuGraph(const Graph& graph, + const FunctionLibraryDefinition* function_library) { + auto predicate = [](const Graph& graph) { + for (const Node* node : graph.nodes()) { + // _xla_compile_device_type=TPU is found in single-core TPU graphs. + auto attr = + node->attrs().FindByString(std::string(kCompileDeviceTypeAttr)); + if (attr && attr->s() == kTpuDevice) { + return true; + } + } + return false; + }; + return HasAttr(graph, function_library, predicate).succeeded(); +} + +bool IsReplicatedGraph(mlir::ModuleOp module) { + auto walk_result = module.walk([&](mlir::Operation* op) { + // TODO(b/223677572): Once the scope for new compilation and replication + // markers is expanded beyond bridge we can remove this check for + // `kTPUReplicateAttr`, we will then always have a `kCompileDeviceTypeAttr` + // in such cases (see above). + // TODO(b/229028654): Remove string conversion once we have C++17. + const llvm::StringRef tpu_replicate_attr_name(kTpuReplicateAttr.data(), + kTpuReplicateAttr.size()); + auto replicate_attr = + op->getAttrOfType(tpu_replicate_attr_name); + if (replicate_attr) return mlir::WalkResult::interrupt(); + return mlir::WalkResult::advance(); + }); + return walk_result.wasInterrupted(); +} + +bool IsSingleCoreTPUGraph(mlir::ModuleOp module) { + auto walk_result = module.walk([&](mlir::Operation* op) { + // Check for ops with compile device type "TPU". This allows us to support + // TPU compilation without replication. Note that currently the compile + // device type is not set by default before bridge, only if eager context + // attribute `jit_compile_rewrite` is true. + // TODO(b/229028654): Remove string conversion once we have C++17. 
+ const llvm::StringRef compile_device_type_attr_name( + kCompileDeviceTypeAttr.data(), kCompileDeviceTypeAttr.size()); + auto compilation_attr = + op->getAttrOfType(compile_device_type_attr_name); + if (compilation_attr && compilation_attr.getValue().str() == kTpuDevice) { + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + return walk_result.wasInterrupted(); +} + +} // namespace + +bool IsSupportedByNonReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library) { + return IsNonReplicatedGraph(graph, function_library); +} + +bool IsSupportedByReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library) { + return IsReplicatedGraph(graph, function_library) || + IsSingleCoreTpuGraph(graph, function_library); +} + +bool IsSupportedByReplicatedBridge(mlir::ModuleOp module) { + return IsReplicatedGraph(module) || IsSingleCoreTPUGraph(module); +} + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h new file mode 100644 index 00000000000000..5ea0bdc71ea0d0 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h @@ -0,0 +1,43 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#ifndef TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ +#define TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ + +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "tensorflow/core/framework/function.h" + +namespace tensorflow { + +// Checks if a graph or reachable functions in the library have any +// StatefulPartitionedOps with _XlaMustCompile=true. The function library will +// be skipped if nullptr is provided. +bool IsSupportedByNonReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library); + +// Checks if a graph or reachable functions in the library have any ops with +// _tpu_replicate or _xla_compile_device_type=TPU. The function library will be +// skipped if nullptr is provided. + +bool IsSupportedByReplicatedBridge( + const Graph& graph, const FunctionLibraryDefinition* function_library); + +// Check if an MLIR module has any ops with _tpu_replicate or +// _xla_compile_device_type=TPU. +bool IsSupportedByReplicatedBridge(mlir::ModuleOp module); + +} // namespace tensorflow + +#endif // TENSORFLOW_COMPILER_MLIR_TF2XLA_INTERNAL_MLIR_BRIDGE_PASS_UTIL_H_ diff --git a/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc new file mode 100644 index 00000000000000..8ce19560f1f349 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util_test.cc @@ -0,0 +1,228 @@ +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h" + +#include + +#include +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Parser/Parser.h" // from @llvm-project +#include "tensorflow/cc/framework/ops.h" +#include "tensorflow/cc/framework/scope.h" +#include "tensorflow/cc/ops/function_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" +#include "tensorflow/compiler/tf2xla/tf2xla_defs.h" +#include "tensorflow/core/framework/function.h" +#include "tensorflow/core/framework/function_testlib.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/graph/graph.h" +#include "tensorflow/core/graph/node_builder.h" +#include "tensorflow/core/platform/enable_tf2_utils.h" +#include "tsl/lib/core/status_test_util.h" + +namespace tensorflow { + +namespace { + +FunctionDef OuterXTimesTwo() { + return FunctionDefHelper::Define( + // Name + "OuterXTimesTwo", + // Args + {"x: float"}, + // Return values + {"y: float"}, + // Attr def + {}, + {{{"y"}, + "StatefulPartitionedCall", + {"x"}, + {{"Tin", DataTypeSlice{DT_FLOAT}}, + {"Tout", DataTypeSlice{DT_FLOAT}}, + {"f", + FunctionDefHelper::FunctionRef("XTimesTwoFloat", {{"T", DT_FLOAT}})}, + {std::string(kMustCompileAttr), true}}}}); +} + +TEST(IsSupportedByNonReplicatedBridge, NonReplicatedGraph) { + 
const FunctionDef& fd = test::function::XTimesTwo(); + FunctionDefLibrary flib; + *flib.add_function() = fd; + FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); + Graph graph(flib_def); + graph.SetConstructionContext(ConstructionContext::kEagerRuntime); + tensorflow::set_tf2_execution(true); + + ConfigProto config = ConfigProto(); + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); + std::vector inputs({NodeBuilder::NodeOut(a.node())}); + + Node* call; + NameAttrList f_name_attr; + f_name_attr.set_name(fd.signature().name()); + TF_ASSERT_OK( + NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) + .Input(inputs) + .Attr("Tin", {DT_FLOAT}) + .Attr("Tout", {DT_FLOAT}) + .Attr("f", f_name_attr) + .Finalize(root.graph(), &call)); + call->AddAttr(std::string(kMustCompileAttr), true); + + TF_ASSERT_OK(root.ToGraph(&graph)); + + EXPECT_TRUE( + IsSupportedByNonReplicatedBridge(graph, /*function_library=*/nullptr)); +} + +// Checks that HasAttr actually goes through function library. +TEST(IsSupportedByNonReplicatedBridge, NonReplicatedFunctionLibrary) { + const FunctionDef& fd = OuterXTimesTwo(); + FunctionDefLibrary flib; + *flib.add_function() = fd; + FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); + Graph graph(OpRegistry::Global()); + graph.SetConstructionContext(ConstructionContext::kEagerRuntime); + tensorflow::set_tf2_execution(true); + + ConfigProto config = ConfigProto(); + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); + std::vector inputs({NodeBuilder::NodeOut(a.node())}); + + // Builds a call without compilation markers that calls a function with Xla + // clusters. 
+ Node* call; + NameAttrList f_name_attr; + f_name_attr.set_name(fd.signature().name()); + TF_ASSERT_OK( + NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) + .Input(inputs) + .Attr("Tin", {DT_FLOAT}) + .Attr("Tout", {DT_FLOAT}) + .Attr("f", f_name_attr) + .Finalize(root.graph(), &call)); + + TF_ASSERT_OK(root.ToGraph(&graph)); + EXPECT_TRUE( + IsSupportedByNonReplicatedBridge(graph, /*function_library=*/&flib_def)); +} + +TEST(IsSupportedByReplicatedBridge, ReplicatedGraph) { + const FunctionDef& fd = test::function::XTimesTwo(); + FunctionDefLibrary flib; + *flib.add_function() = fd; + FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); + Graph graph(flib_def); + graph.SetConstructionContext(ConstructionContext::kEagerRuntime); + tensorflow::set_tf2_execution(true); + + ConfigProto config = ConfigProto(); + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a = ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); + std::vector inputs({NodeBuilder::NodeOut(a.node())}); + + Node* call; + NameAttrList f_name_attr; + f_name_attr.set_name(fd.signature().name()); + TF_ASSERT_OK( + NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) + .Input(inputs) + .Attr("Tin", {DT_FLOAT}) + .Attr("Tout", {DT_FLOAT}) + .Attr("f", f_name_attr) + .Finalize(root.graph(), &call)); + call->AddAttr(std::string(kTpuReplicateAttr), "cluster"); + + TF_ASSERT_OK(root.ToGraph(&graph)); + + EXPECT_TRUE( + IsSupportedByReplicatedBridge(graph, /*function_library=*/nullptr)); +} + +TEST(IsSupportedByReplicatedBridge, SingleCoreTpuGraph) { + const FunctionDef& fd = test::function::XTimesTwo(); + FunctionDefLibrary flib; + *flib.add_function() = fd; + FunctionLibraryDefinition flib_def(OpRegistry::Global(), flib); + Graph graph(flib_def); + graph.SetConstructionContext(ConstructionContext::kEagerRuntime); + tensorflow::set_tf2_execution(true); + + ConfigProto config = ConfigProto(); + Scope root = Scope::NewRootScope().ExitOnError(); + + Output a 
= ops::_Arg(root.WithOpName("A"), DT_FLOAT, 0); + std::vector inputs({NodeBuilder::NodeOut(a.node())}); + + Node* call; + NameAttrList f_name_attr; + f_name_attr.set_name(fd.signature().name()); + TF_ASSERT_OK( + NodeBuilder("B", "StatefulPartitionedCall", &root.graph()->flib_def()) + .Input(inputs) + .Attr("Tin", {DT_FLOAT}) + .Attr("Tout", {DT_FLOAT}) + .Attr("f", f_name_attr) + .Finalize(root.graph(), &call)); + call->AddAttr(std::string(kCompileDeviceTypeAttr), kTpuDevice); + + TF_ASSERT_OK(root.ToGraph(&graph)); + + EXPECT_TRUE( + IsSupportedByReplicatedBridge(graph, /*function_library=*/nullptr)); +} + +TEST(IsSupportedByReplicatedBridge, ReplicatedModule) { + const char* const code = R"mlir( +func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + %0 = "tf.Identity"(%arg0) {_tpu_replicate = "cluster"} : (tensor) -> (tensor) + func.return %0 : tensor +} +)mlir"; + mlir::MLIRContext context; + context.loadDialect(); + mlir::OwningOpRef module = + mlir::parseSourceString(code, &context); + ASSERT_TRUE(module); + EXPECT_TRUE(IsSupportedByReplicatedBridge(*module)); +} + +TEST(IsSupportedByReplicatedBridge, SingleCoreTpuModule) { + const char* const code = R"mlir( +func.func @entry_func_1(%arg0: tensor) -> tensor attributes {tf.entry_function = {}} { + %0 = "tf.Identity"(%arg0) {_xla_compile_device_type = "TPU"} : (tensor) -> (tensor) + func.return %0 : tensor +} +)mlir"; + mlir::MLIRContext context; + context.loadDialect(); + mlir::OwningOpRef module = + mlir::parseSourceString(code, &context); + ASSERT_TRUE(module); + EXPECT_TRUE(IsSupportedByReplicatedBridge(*module)); +} + +} // namespace + +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD index 6250f2cf0ca7c5..df25fdd9ffe4be 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/BUILD @@ -26,6 +26,7 @@ 
cc_library( deps = [ ":extract_head_tail_outside_compilation", ":extract_outside_compilation", + ":hoist_broadcast_read", ":mark_ops_for_outside_compilation", ":tpu_cluster_formation", ":verify_clustering_pass", @@ -351,6 +352,41 @@ cc_library( ], ) +cc_library( + name = "hoist_broadcast_read", + srcs = ["hoist_broadcast_read.cc"], + textual_hdrs = [ + "clustering_passes.h.inc", + ], + deps = [ + ":clustering_passes_inc_gen", + "//tensorflow/compiler/mlir/tensorflow", + "//tensorflow/compiler/mlir/tensorflow:attribute_utils", + "//tensorflow/compiler/mlir/tensorflow:string_util", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_analysis", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", + "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", + "//tensorflow/compiler/mlir/tensorflow/transforms:lower_tf_lib", + "//tensorflow/compiler/mlir/tensorflow/transforms:tf_pass_inc_gen", + "//tensorflow/compiler/mlir/tensorflow/transforms:verify_no_outside_compilation_markers_pass", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalization_op_config", + "//tensorflow/compiler/mlir/tf2xla/transforms:legalize_tf", + "//tensorflow/core:framework", + "//tensorflow/core:lib", + "@com_google_absl//absl/container:flat_hash_map", + "@com_google_absl//absl/log", + "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", + "@llvm-project//mlir:FuncDialect", + "@llvm-project//mlir:IR", + "@llvm-project//mlir:Pass", + "@llvm-project//mlir:Rewrite", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:TransformUtils", + ], +) + tf_cc_test( name = "tpu_cluster_formation_test", srcs = ["tpu_cluster_formation_test.cc"], diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h index ea6187a2309205..3ccf990a4b5272 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h +++ 
b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h @@ -56,6 +56,11 @@ CreateXlaOutlineEntryFunctionsPass(); std::unique_ptr> CreateMarkOpsForOutsideCompilationPass(); +// Creates a pass that hoists reads out of a replicate that are on a variable +// whose value is broadcast to all replicas. +std::unique_ptr> +CreateHoistBroadcastReadPass(); + #define GEN_PASS_REGISTRATION #define GEN_PASS_DECL_MARKOPSFOROUTSIDECOMPILATIONPASS #define GEN_PASS_DECL_TPUCLUSTERFORMATIONPASS diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td index c219c35842c401..90d2e962bc9b1c 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.td @@ -349,3 +349,42 @@ def MarkOpsForOutsideCompilationPass : Pass<"tf-mark-ops-for-outside-compilation let constructor = "tensorflow::tf2xla::internal::CreateMarkOpsForOutsideCompilationPass()"; } + +def HoistBroadcastReadPass : Pass<"tf-hoist-broadcast-read", "mlir::func::FuncOp"> { + let summary = "Hoist reads out of a replicate that are on a resource that is broadcast to all replicas."; + + let description = [{ + Some `ReadVariableOp`s that are within a `tf_device.replicate` read the same + value across all replicas. These reads can be hoisted out of the + `tf_device.replicate` so there's one read for all replicas, and each replica + depends on the result of the read. This transform enables the + xla-broadcast-pass to optimize the broadcast value.
+ + For example, the following: + + ```mlir + tf_device.replicate {n = 2 : i32} { + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + "tf.OpA"(%0) : (tensor) -> () + } + ``` + + will be transformed into: + + ```mlir + %0 = "tf.ReadVariableOp"(%arg0) : (tensor<*x!tf_type.resource>>) -> tensor + tf_device.replicate {n = 2 : i32} { + "tf.OpA"(%0) : (tensor) -> () + } + ``` + + We must ensure that there is a single underlying resource that is not + distributed across replicas. There is a single underlying resource when the + resource device type is CPU, so we cautiously only apply in this case. + + To be cautious we never hoist a read that comes after a write to the same + resource. + }]; + + let constructor = "tensorflow::tf2xla::internal::CreateHoistBroadcastReadPass()"; } diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc b/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc new file mode 100644 index 00000000000000..732bae8c67b018 --- /dev/null +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/hoist_broadcast_read.cc @@ -0,0 +1,154 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License.
+==============================================================================*/ + +#include +#include + +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/Casting.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_device.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/core/util/device_name_utils.h" + +namespace tensorflow { +namespace tf2xla { +namespace internal { + +namespace { + +using mlir::BlockArgument; +using mlir::failure; +using mlir::LogicalResult; +using mlir::Operation; +using mlir::OperationPass; +using mlir::OpOperand; +using mlir::StringAttr; +using mlir::success; +using mlir::Value; +using mlir::WalkResult; +using mlir::func::FuncOp; +using mlir::TF::ReadVariableOp; +using mlir::tf_device::ReplicateOp; + +#define GEN_PASS_DEF_HOISTBROADCASTREADPASS +#include "tensorflow/compiler/mlir/tf2xla/internal/passes/clustering_passes.h.inc" + +constexpr char kFuncDeviceAttr[] = "tf.device"; +constexpr char kCpuDeviceType[] = "CPU"; + +struct HoistBroadcastRead + : public impl::HoistBroadcastReadPassBase { + void runOnOperation() override; +}; + +// Get the ancestor of `descendant` that is a direct child of `ancestor`. +Operation* GetAncestorBelow(Operation* descendant, Operation* ancestor) { + Operation* parent = descendant->getParentOp(); + if (!parent) return nullptr; + if (parent == ancestor) return descendant; + return GetAncestorBelow(parent, ancestor); +} + +// `is_cpu_read` is set to `true` iff `read` is on a resource with device type +// CPU. 
+LogicalResult IsCpuRead(FuncOp func, ReadVariableOp read, bool& is_cpu_read) { + if (auto arg = read->getOperand(0).dyn_cast()) { + if (arg.getOwner() != &(func.front())) { + is_cpu_read = false; + return success(); + } + if (auto attr = func.getArgAttrOfType(arg.getArgNumber(), + kFuncDeviceAttr)) { + std::string device = attr.getValue().str(); + tensorflow::DeviceNameUtils::ParsedName parsed_name; + if (!tensorflow::DeviceNameUtils::ParseFullName(device, &parsed_name)) { + return read->emitOpError() << "invalid device '" << device << "'"; + } + is_cpu_read = parsed_name.type == kCpuDeviceType; + return success(); + } + } + is_cpu_read = false; + return success(); +} + +// Get the reads to hoist in the `replicate`. +LogicalResult GetReads(FuncOp func, ReplicateOp replicate, + llvm::SmallVector& reads) { + for (Operation& op : replicate.getBody().front()) { + if (auto read = llvm::dyn_cast(&op)) { + bool is_cpu_read; + if (failed(IsCpuRead(func, read, is_cpu_read))) return failure(); + if (is_cpu_read) reads.push_back(read); + } + } + return success(); +} + +// Move reads above the `replicate`. Skip reads that come after a write to the +// same resource. +void MoveReads(ReplicateOp replicate, + llvm::SmallVector& reads) { + for (ReadVariableOp read : reads) { + Value res = read.getResource(); + Operation* scope = res.getParentBlock()->getParentOp(); + if (!scope->isProperAncestor(replicate)) continue; + bool has_conflicting_write = false; + for (OpOperand& use : res.getUses()) { + Operation* using_op = use.getOwner(); + if (using_op == read) continue; + if (!replicate->isProperAncestor(using_op)) continue; + Operation* peer = GetAncestorBelow(using_op, replicate); + if (read->isBeforeInBlock(peer)) continue; + if (llvm::isa(peer)) continue; + has_conflicting_write = true; + } + if (has_conflicting_write) continue; + read->moveBefore(replicate); + } +} + +// Hoist `ReadVariableOp`s above the `tf_device.replicate`s. 
+void HoistBroadcastRead::runOnOperation() { + FuncOp func = getOperation(); + + auto result = func.walk([&](ReplicateOp replicate) { + llvm::SmallVector reads; + if (failed(GetReads(func, replicate, reads))) + return WalkResult::interrupt(); + MoveReads(replicate, reads); + return WalkResult::advance(); + }); + + if (result.wasInterrupted()) return signalPassFailure(); +} + +} // namespace + +std::unique_ptr> CreateHoistBroadcastReadPass() { + return std::make_unique(); +} + +} // namespace internal +} // namespace tf2xla +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir index 7ba98798c126df..c098bf494272fd 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_clustering_pass_test.mlir @@ -3,7 +3,7 @@ func.func @testNotTfDialect(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { // expected-error@below {{op is in dialect chlo not in tf functional dialect}} - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : tensor<1x32x10x32xi32> } diff --git a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir index 5a6fda697d23fa..7d88228447e4af 100644 --- a/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir +++ b/tensorflow/compiler/mlir/tf2xla/internal/passes/verify_input_dialect_to_executor_pass_test.mlir @@ -11,6 +11,10 @@ func.func 
@testNoClusterFuncOpPasses(%arg0: tensor<4x?x!tf_type.stringref>) -> t // ----- +func.func @_func(%arg0: tensor) -> tensor { + func.return %arg0 : tensor +} + func.func @testClusterFuncOpFails(%arg0: tensor) -> tensor { // expected-error@below {{failed TF functional to executor validation, op tf_device.cluster_func is not allowed}} %cluster = "tf_device.cluster_func"(%arg0) {func = @_func} : (tensor) -> tensor @@ -29,6 +33,6 @@ func.func @testTFDialect(%arg0: tensor<4x?x!tf_type.stringref>) -> tensor<4x2x!t func.func @testNotTfDialect(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { // expected-error@below {{op is in dialect chlo which is not an accepted dialect}} - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : tensor<1x32x10x32xi32> -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc index 3f35813744c60d..f00e12690d380b 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.cc @@ -24,7 +24,8 @@ namespace tensorflow { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, bool run_tpu_bridge, + std::optional config_proto, + bool is_supported_by_replicated_brige, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats) { switch (GetMlirBridgeRolloutState(config_proto)) { diff --git a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h index 
5c7f47a219e10e..66a68ae53f535f 100644 --- a/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h +++ b/tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h @@ -53,7 +53,8 @@ enum class MlirBridgeRolloutPolicy { MlirBridgeRolloutPolicy GetMlirBridgeRolloutPolicy( const tensorflow::Graph& graph, const FunctionLibraryDefinition* function_library, - std::optional config_proto, bool run_tpu_bridge, + std::optional config_proto, + bool is_supported_by_replicated_brige, bool uses_uninitialized_resource_args, bool is_v1_compat, bool record_stats); diff --git a/tensorflow/compiler/mlir/tf2xla/tests/BUILD b/tensorflow/compiler/mlir/tf2xla/tests/BUILD index c68c485954de1b..97bb01c30d1855 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/tests/BUILD @@ -1,5 +1,5 @@ -load("//tensorflow:tensorflow.default.bzl", "filegroup") load("//tensorflow/compiler/mlir:glob_lit_test.bzl", "glob_lit_tests") +load("//tensorflow:tensorflow.default.bzl", "filegroup") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir index 11cfffc24eaa33..d79804a6d38cc6 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-communication.mlir @@ -119,7 +119,9 @@ func.func @send_to_host(%arg0: tensor) { // CHECK: [[INIT_TOKEN:%.*]] = mhlo.create_token // CHECK: "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: is_host_transfer = true // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_key_dtoh_0"} // CHECK-SAME: mhlo.sharding = 
"\08\01\1A\01\01\22\01\00" @@ -137,7 +139,9 @@ func.func @recv_from_host() -> tensor { // CHECK: [[INIT_TOKEN:%.*]] = mhlo.create_token // CHECK: [[RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: is_host_transfer = true // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_key_htod_0"} // CHECK-SAME: mhlo.sharding = "\08\01\1A\01\01\22\01\00" @@ -158,21 +162,29 @@ func.func @multiple_consecutive_ops(%arg0: tensor) -> tensor { // CHECK: [[INIT_TOKEN:%.*]] = mhlo.create_token // CHECK: [[SEND0_ARG0_TOKEN:%.*]] = "mhlo.send"([[ARG0]], [[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send0_dtoh_0"} // CHECK: [[RECV0_RETVAL0_TUPLE:%.*]]:2 = "mhlo.recv"([[SEND0_ARG0_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv0_htod_0"} %0 = "tf._XlaHostComputeMlir"(%arg0) {recv_key = "recv0", send_key = "send0", host_mlir_module = ""} : (tensor) -> tensor // CHECK: [[SEND1_ARG0_TOKEN:%.*]] = "mhlo.send"([[RECV0_RETVAL0_TUPLE]]#0, [[RECV0_RETVAL0_TUPLE]]#1) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", 
_xla_host_transfer_rendezvous = "send1_dtoh_0"} // CHECK: [[RECV1_RETVAL0_TUPLE:%.*]]:2 = "mhlo.recv"([[SEND1_ARG0_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv1_htod_0"} %1 = "tf._XlaHostComputeMlir"(%0) {recv_key = "recv1", send_key = "send1", host_mlir_module = ""} : (tensor) -> tensor @@ -376,11 +388,15 @@ func.func @if_both_branches(%arg0: tensor, %arg1: tensor, %arg2: tensor // CHECK: [[IF:%.*]]:2 = "mhlo.if"([[ARG0]]) %0 = "mhlo.if"(%arg0) ({ // CHECK: [[TRUE_SEND_TOKEN:%.*]] = "mhlo.send"([[ARG1]], [[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} // CHECK: [[TRUE_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[TRUE_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_if_true", send_key = "send_if_true", host_mlir_module = ""} : (tensor) -> tensor @@ -388,11 +404,15 @@ func.func @if_both_branches(%arg0: tensor, %arg1: tensor, %arg2: tensor "mhlo.return"(%1) : (tensor) -> () }, { // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[ARG2]], [[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: 
mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} // CHECK: [[FALSE_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[FALSE_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg2) {recv_key = "recv_if_false", send_key = "send_if_false", host_mlir_module = ""} : (tensor) -> tensor @@ -419,11 +439,15 @@ func.func @if_true_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_if_true_dtoh_0"} // CHECK: [[TRUE_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[TRUE_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_if_true_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_if_true", send_key = "send_if_true", host_mlir_module = ""} : (tensor) -> tensor @@ -456,11 +480,15 @@ func.func @if_false_branch(%arg0: tensor, %arg1: tensor, %arg2: tensor< "mhlo.return"(%arg1) : (tensor) -> () }, { // CHECK: [[FALSE_SEND_TOKEN:%.*]] = "mhlo.send"([[ARG2]], [[INIT_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", 
_xla_host_transfer_rendezvous = "send_if_false_dtoh_0"} // CHECK: [[FALSE_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[FALSE_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_if_false_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg2) {recv_key = "recv_if_false", send_key = "send_if_false", host_mlir_module = ""} : (tensor) -> tensor @@ -681,11 +709,15 @@ func.func @while_cond_body(%arg0: tensor) -> tensor { %0 = "mhlo.while"(%arg0) ({ ^bb0(%arg1: tensor): // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[ITER_ARG_VALUE]], [[ITER_ARG_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} // CHECK: [[COND_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[COND_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", host_mlir_module = ""} : (tensor) -> tensor @@ -697,11 +729,15 @@ func.func @while_cond_body(%arg0: tensor) -> tensor { }, { ^bb0(%arg1: tensor): // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[ITER_ARG_VALUE]], [[ITER_ARG_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: 
mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} // CHECK: [[BODY_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[BODY_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", host_mlir_module = ""} : (tensor) -> tensor @@ -727,11 +763,15 @@ func.func @while_cond(%arg0: tensor) -> tensor { ^bb0(%arg1: tensor): // CHECK: [[COND_SEND_TOKEN:%.*]] = "mhlo.send"([[ITER_ARG_VALUE]], [[ITER_ARG_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_while_cond_dtoh_0"} // CHECK: [[COND_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[COND_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_while_cond_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_cond", send_key = "send_while_cond", host_mlir_module = ""} : (tensor) -> tensor @@ -772,11 +812,15 @@ func.func @while_body(%arg0: tensor) -> tensor { ^bb0(%arg1: tensor): // CHECK: [[BODY_SEND_TOKEN:%.*]] = "mhlo.send"([[ITER_ARG_VALUE]], [[ITER_ARG_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle 
= + // CHECK-SAME: type = 2 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "send_while_body_dtoh_0"} // CHECK: [[BODY_RECV_TUPLE:%.*]]:2 = "mhlo.recv"([[BODY_SEND_TOKEN]]) - // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: channel_handle = #mhlo.channel_handle + // CHECK-SAME: handle = + // CHECK-SAME: type = 3 // CHECK-SAME: mhlo.frontend_attributes = {_xla_host_transfer_handler_name = "tf_rendezvous", _xla_host_transfer_rendezvous = "recv_while_body_htod_0"} %1 = "tf._XlaHostComputeMlir"(%arg1) {recv_key = "recv_while_body", send_key = "send_while_body", host_mlir_module = ""} : (tensor) -> tensor diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir index 44997584147a0f..8e2876f5707deb 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf-quant.mlir @@ -368,7 +368,7 @@ func.func @uniform_quantized_add(%arg0: tensor<3x2x!tf_type.qint32>) -> tensor<3 // CHECK-DAG: %[[LHS:.*]] = mhlo.convert %arg0 : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> // CHECK-DAG: %[[RHS:.*]] = mhlo.constant() {value = dense<127> : tensor<2xi32>} : () -> tensor<2x!quant.uniform> - // CHECK: %[[RES:.*]] = chlo.broadcast_add %[[LHS]], %[[RHS]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK: %[[RES:.*]] = chlo.broadcast_add %[[LHS]], %[[RHS]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> // CHECK: %[[RES_INT:.*]] = mhlo.convert %[[RES]] : (tensor<3x2x!quant.uniform>) -> tensor<3x2xi32> @@ -418,10 +418,10 @@ func.func @uniform_quantized_clip_by_value(%input: tensor<3x2xf32>) -> tensor<3x // CHECK-DAG: %[[CONVERT_1:.*]] = mhlo.convert %[[OPERAND]] : (tensor<3x2x!quant.uniform>) -> tensor<3x2xi32> // CHECK-DAG: 
%[[CONVERT_2:.*]] = mhlo.convert %[[CONVERT_1]] : (tensor<3x2xi32>) -> tensor<3x2x!quant.uniform> - // CHECK: %[[MIN_CLIPPED:.*]] = chlo.broadcast_maximum %[[CONVERT_2]], %[[MIN_MAX]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK: %[[MIN_CLIPPED:.*]] = chlo.broadcast_maximum %[[CONVERT_2]], %[[MIN_MAX]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> - // CHECK: chlo.broadcast_minimum %[[MIN_CLIPPED]], %[[MIN_MAX]] {broadcast_dimensions = dense<1> : tensor<1xi64>} : + // CHECK: chlo.broadcast_minimum %[[MIN_CLIPPED]], %[[MIN_MAX]] {broadcast_dimensions = array} : // CHECK-SAME: (tensor<3x2x!quant.uniform>, tensor<2x!quant.uniform>) // CHECK-SAME: -> tensor<3x2x!quant.uniform> %1 = "tf.UniformQuantizedClipByValue"(%0, %min, %max, %scales, %zps) { @@ -557,4 +557,4 @@ func.func @while_region_with_quant_two_args(%arg0: tensor<2x2xf32>, %arg1: tenso // return %[[RESULT0]], %[[RESULT1]] func.return %3, %4 : tensor<2x?xf32>, tensor -} \ No newline at end of file +} diff --git a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir index ba0f70e369f3bf..018046d66d57a2 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/legalize-tf.mlir @@ -166,7 +166,7 @@ func.func @fusedBatchNormGrad_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: tens // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = mhlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> // CHECK: 
%[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> @@ -218,7 +218,7 @@ func.func @fusedBatchNormGradV2_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = mhlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> @@ -299,7 +299,7 @@ func.func @fusedBatchNormGradV3_noTraining(%arg0: tensor<8x8x8x8xf32>, %arg1: te // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = mhlo.constant dense<1.000000e-03> : tensor - // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> @@ -381,7 +381,7 @@ func.func @fusedBatchNormGradV3_noTraining_NCHW(%arg0: tensor<8x8x8x8xf32>, %arg // CHECK-NEXT: %[[act:.*]] = mhlo.convert %arg1 : tensor<8x8x8x8xf32> // CHECK-NEXT: %[[eps:.*]] = mhlo.constant dense<1.000000e-03> : 
tensor - // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<8xf32>, tensor) -> tensor<8xf32> + // CHECK-NEXT: %[[add:.*]] = chlo.broadcast_add %arg4, %[[eps]] {broadcast_dimensions = array} : (tensor<8xf32>, tensor) -> tensor<8xf32> // CHECK-NEXT: %[[scr1:.*]] = mhlo.rsqrt %[[add]] : tensor<8xf32> // CHECK: %[[bcast_arg3:.+]] = "mhlo.dynamic_broadcast_in_dim"(%arg3, {{.*}}) {broadcast_dimensions = dense<1> : tensor<1xi64>} : (tensor<8xf32>, tensor<4xindex>) -> tensor<8x8x8x8xf32> @@ -739,7 +739,7 @@ func.func @matrix_diag_part_align_7d(%arg0: tensor<3x5x7x9x11x13x17xf32>) -> ten // CHECK-LABEL: func @erf func.func @erf(%arg0: tensor<2x3xf32>) -> tensor<2x3xf32> { - // CHECK: chlo.erf %arg0 : tensor<2x3xf32> + // CHECK: mhlo.erf %arg0 : tensor<2x3xf32> %0 = "tf.Erf"(%arg0) : (tensor<2x3xf32>) -> tensor<2x3xf32> func.return %0 : tensor<2x3xf32> } @@ -787,14 +787,14 @@ func.func @unary_einsum(%arg0: tensor<2x3xf32>) -> tensor<2x2xf32> { // CHECK-LABEL: func @floordiv_broadcast_i32 func.func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[DIV]], %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = array} + // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[DIV]], %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[MUL]], %arg0 {comparison_direction = #chlo} // CHECK-DAG: [[ZEROS1:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = #chlo} // CHECK-DAG: [[ZEROS2:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = 
#chlo} - // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ONES:%.+]] = mhlo.constant dense<1> // CHECK-DAG: [[SUB:%.+]] = chlo.broadcast_subtract [[DIV]], [[ONES]] @@ -808,14 +808,14 @@ func.func @floordiv_broadcast_i32(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) // CHECK-LABEL: func @floordiv_reverse_broadcast_i32 func.func @floordiv_reverse_broadcast_i32(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[DIV]] - // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[MUL]], %arg0 {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[MUL]], %arg0 {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[ZEROS1:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = #chlo} // CHECK-DAG: [[ZEROS2:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = #chlo} - // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ONES:%.+]] = 
mhlo.constant dense<1> // CHECK-DAG: [[SUB:%.+]] = chlo.broadcast_subtract [[DIV]], [[ONES]] @@ -865,14 +865,14 @@ func.func @floordiv_f16_broadcast(%arg0: tensor<2x3xf16>, %arg1: tensor<3xf16>) // CHECK-LABEL: func @floordiv_dynamic func.func @floordiv_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} - // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[DIV]], %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = array} + // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[DIV]], %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[MUL]], %arg0 {comparison_direction = #chlo} // CHECK-DAG: [[ZEROS1:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg0, [[ZEROS1]] {comparison_direction = #chlo} // CHECK-DAG: [[ZEROS2:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare %arg1, [[ZEROS2]] {comparison_direction = #chlo} - // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ONES:%.+]] = mhlo.constant dense<1> // CHECK-DAG: [[SUB:%.+]] = chlo.broadcast_subtract [[DIV]], [[ONES]] @@ -886,7 +886,7 @@ func.func @floordiv_dynamic(%arg0: tensor, %arg1: tensor) -> ten // CHECK-LABEL: func @floordiv_unsigned func.func @floordiv_unsigned(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[DIV:%.+]] = chlo.broadcast_divide %arg0, %arg1 
{broadcast_dimensions = array} // CHECK: return [[DIV]] %0 = "tf.FloorDiv"(%arg0, %arg1) : (tensor, tensor) -> tensor func.return %0: tensor @@ -926,12 +926,12 @@ func.func @floordiv_int(%arg0: tensor<*xi32>, %arg1: tensor<*xi32>) -> tensor<*x // CHECK-LABEL: func @floormod_broadcast_numerator func.func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[ZL:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = #chlo} // CHECK-DAG: [[ZR:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = #chlo} - // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] @@ -945,15 +945,15 @@ func.func @floormod_broadcast_numerator(%arg0: tensor<3xi32>, %arg1: tensor<2x3x // CHECK-LABEL: func @floormod_broadcast_denominator func.func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor<3xi32>) -> tensor<2x3xi32> { - // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[ZL:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP1:%.+]] = 
chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = #chlo} // CHECK-DAG: [[ZR:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = #chlo} - // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = #chlo} - // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = array, comparison_direction = #chlo} + // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = array} // CHECK-DAG: [[SELECT:%.+]] = mhlo.select [[AND]], [[ADD]], [[REM]] // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<2x3xi32>, tensor<3xi32>) -> tensor<2x3xi32> @@ -964,7 +964,7 @@ func.func @floormod_broadcast_denominator(%arg0: tensor<2x3xi32>, %arg1: tensor< // CHECK-LABEL: func @floormod_unsigned_broadcast_denominator func.func @floormod_unsigned_broadcast_denominator(%arg0: tensor<2x3xui32>, %arg1: tensor<3xui32>) -> tensor<2x3xui32> { - // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = array} // CHECK-NEXT: return [[REM]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor<2x3xui32>, tensor<3xui32>) -> tensor<2x3xui32> func.return %0: tensor<2x3xui32> @@ -974,15 +974,15 @@ func.func @floormod_unsigned_broadcast_denominator(%arg0: tensor<2x3xui32>, 
%arg // CHECK-LABEL: func @floormod_dynamic_broadcast_numerator func.func @floormod_dynamic_broadcast_numerator_(%arg0: tensor, %arg1: tensor) -> tensor { - // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = array} // CHECK-DAG: [[ZL:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = #chlo} // CHECK-DAG: [[ZR:%.+]] = mhlo.constant dense<0> // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = #chlo} - // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = #chlo} - // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = dense<1> : tensor<1xi64>, comparison_direction = #chlo} + // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = array, comparison_direction = #chlo} + // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {broadcast_dimensions = array, comparison_direction = #chlo} // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] - // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = dense<1> : tensor<1xi64>} + // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] {broadcast_dimensions = array} // CHECK-DAG: [[SELECT:%.+]] = mhlo.select [[AND]], [[ADD]], [[REM]] // CHECK-NEXT: return [[SELECT]] %0 = "tf.FloorMod"(%arg0, %arg1) : (tensor, tensor) -> tensor @@ -994,12 +994,12 @@ func.func @floormod_dynamic_broadcast_numerator_(%arg0: tensor, %arg1: // CHECK-LABEL: func @floormod_dynamic_broadcast_denominator func.func @floormod_dynamic_broadcast_denominator_(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK-NOT: tf.FloorMod - // CHECK-DAG: [[REM:%.+]] = 
chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor, tensor) -> tensor + // CHECK-DAG: [[REM:%.+]] = chlo.broadcast_remainder %arg0, %arg1 {broadcast_dimensions = array} : (tensor, tensor) -> tensor // CHECK-DAG: [[ZL:%.+]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: [[CMP1:%.+]] = chlo.broadcast_compare [[REM]], [[ZL]] {comparison_direction = #chlo} : (tensor, tensor) -> tensor // CHECK-DAG: [[ZR:%.+]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: [[CMP2:%.+]] = chlo.broadcast_compare %arg1, [[ZR]] {comparison_direction = #chlo} : (tensor, tensor) -> tensor - // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = #chlo} : (tensor, tensor) -> tensor + // CHECK-DAG: [[CMP3:%.+]] = chlo.broadcast_compare [[REM]], [[ZR]] {broadcast_dimensions = array, comparison_direction = #chlo} : (tensor, tensor) -> tensor // CHECK-DAG: [[CMP4:%.+]] = chlo.broadcast_compare [[CMP2]], [[CMP3]] {comparison_direction = #chlo} : (tensor, tensor) -> tensor // CHECK-DAG: [[AND:%.+]] = chlo.broadcast_and [[CMP1]], [[CMP4]] : (tensor, tensor) -> tensor // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add %arg1, [[REM]] : (tensor, tensor) -> tensor @@ -1839,8 +1839,8 @@ func.func @elu_unranked(%arg0: tensor) -> tensor { func.func @elu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> tensor<4x8xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor - // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %[[FEATURES]], %[[ZERO]] {broadcast_dimensions = dense<> : tensor<0xi64>, comparison_direction = #chlo} - // CHECK-DAG: %[[ADD1:.*]] = chlo.broadcast_add %[[FEATURES]], %[[ONE]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: %[[PRED:.*]] = chlo.broadcast_compare %[[FEATURES]], %[[ZERO]] {broadcast_dimensions = array, 
comparison_direction = #chlo} + // CHECK-DAG: %[[ADD1:.*]] = chlo.broadcast_add %[[FEATURES]], %[[ONE]] {broadcast_dimensions = array} // CHECK-DAG: %[[MULGRAD:.*]] = mhlo.multiply %[[GRADIENTS]], %[[ADD1]] : (tensor<4x8xf32>, tensor) -> tensor<4x8xf32> // CHECK: %[[RESULT:.*]] = mhlo.select %[[PRED]], %[[GRADIENTS]], %[[MULGRAD]] // CHECK: return %[[RESULT]] @@ -1857,7 +1857,7 @@ func.func @elu_grad(%gradients: tensor<4x8xf32>, %features: tensor) -> // CHECK-LABEL: func @relu func.func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0> : tensor - // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<1xi32>) -> tensor<1xi32> + // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = array} : (tensor, tensor<1xi32>) -> tensor<1xi32> %0 = "tf.Relu"(%arg0) : (tensor<1xi32>) -> tensor<1xi32> func.return %0: tensor<1xi32> } @@ -1867,7 +1867,7 @@ func.func @relu(%arg0: tensor<1xi32>) -> tensor<1xi32> { // CHECK-LABEL: func @relu_unranked func.func @relu_unranked(%arg0: tensor) -> tensor { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0> : tensor - // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = array} : (tensor, tensor) -> tensor %0 = "tf.Relu"(%arg0) : (tensor) -> tensor func.return %0: tensor } @@ -1877,7 +1877,7 @@ func.func @relu_unranked(%arg0: tensor) -> tensor { // CHECK-LABEL: func @relu_unsigned func.func @relu_unsigned(%arg0: tensor) -> tensor { // CHECK: %[[ZERO:.*]] = mhlo.constant dense<0> : tensor - // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: chlo.broadcast_maximum %[[ZERO]], %arg0 {broadcast_dimensions = array} : (tensor, tensor) -> tensor %0 = "tf.Relu"(%arg0) : (tensor) -> tensor 
func.return %0: tensor } @@ -2017,7 +2017,7 @@ func.func @softsign_grad(%arg0: tensor<4x10xf32>, %arg1: tensor<4x10xf32>) -> te // CHECK-NEXT: %[[ONE:.*]] = mhlo.constant dense<1.000000e+00> : tensor // CHECK-NEXT: %[[ABS:.*]] = mhlo.abs %{{.*}} : tensor<4x10xf32> - // CHECK-NEXT: %[[BROADCAST_ADD:.*]] = chlo.broadcast_add %[[ONE]], %[[ABS]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor<4x10xf32>) -> tensor<4x10xf32> + // CHECK-NEXT: %[[BROADCAST_ADD:.*]] = chlo.broadcast_add %[[ONE]], %[[ABS]] {broadcast_dimensions = array} : (tensor, tensor<4x10xf32>) -> tensor<4x10xf32> // CHECK-NEXT: %[[MUL:.*]] = mhlo.multiply %[[BROADCAST_ADD]], %[[BROADCAST_ADD]] : tensor<4x10xf32> // CHECK-NEXT: %[[BROADCAST_DIV:.*]] = chlo.broadcast_divide %{{.*}}, %[[MUL]] : (tensor<4x10xf32>, tensor<4x10xf32>) -> tensor<4x10xf32> // CHECK-NEXT: return %[[BROADCAST_DIV]] : tensor<4x10xf32> @@ -2775,7 +2775,7 @@ func.func @sigmoid_grad_complex(%arg0: tensor<2xcomplex>, %arg1: tensor<2xc // CHECK-LABEL: @sigmoid_grad_dynamic func.func @sigmoid_grad_dynamic(%arg0: tensor, %arg1: tensor) -> tensor { // CHECK: chlo.broadcast_multiply {{.*}} : (tensor, tensor) -> tensor - // CHECK: chlo.broadcast_subtract {{.*}} {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: chlo.broadcast_subtract {{.*}} {broadcast_dimensions = array} : (tensor, tensor) -> tensor // CHECK: chlo.broadcast_multiply {{.*}} : (tensor, tensor) -> tensor %0 = "tf.SigmoidGrad"(%arg0, %arg1) : (tensor, tensor) -> tensor func.return %0 : tensor @@ -3662,7 +3662,7 @@ func.func @mean(%arg0: tensor<4x8xf16>) -> tensor<4x1xf16> { // CHECK: %[[CAST:.*]] = mhlo.convert %arg0 : (tensor<4x8xf16>) -> tensor<4x8xf32> // CHECK: %[[INITIAL:.*]] = mhlo.constant dense<-0.000000e+00> : tensor // CHECK: %[[REDUCED:.*]] = mhlo.reduce(%[[CAST]] init: %[[INITIAL]]) applies mhlo.add across dimensions = [1] : (tensor<4x8xf32>, tensor) -> tensor<4xf32> - // CHECK: %[[MEAN:.*]] = 
chlo.broadcast_divide %[[REDUCED]], %{{.*}} {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<4xf32>, tensor) -> tensor<4xf32> + // CHECK: %[[MEAN:.*]] = chlo.broadcast_divide %[[REDUCED]], %{{.*}} {broadcast_dimensions = array} : (tensor<4xf32>, tensor) -> tensor<4xf32> // CHECK: %[[CAST_BACK:.*]] = mhlo.convert %[[MEAN]] : (tensor<4xf32>) -> tensor<4xf16> // CHECK: %[[RESULT:.*]] = mhlo.reshape %[[CAST_BACK]] : (tensor<4xf16>) -> tensor<4x1xf16> // CHECK: return %[[RESULT]] : tensor<4x1xf16> @@ -3698,7 +3698,7 @@ func.func @mean_dynamic(%arg0: tensor) -> tensor { // CHECK: %[[INDEX_CAST:.*]] = arith.index_cast %[[MUL]] : index to i64 // CHECK: %[[TENSOR:.*]] = tensor.from_elements %[[INDEX_CAST]] : tensor // CHECK: %[[CONVERT:.*]] = mhlo.convert %[[TENSOR]] : (tensor) -> tensor - // CHECK: %[[MEAN:.*]] = chlo.broadcast_divide %[[REDUCED]], %[[CONVERT]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor, tensor) -> tensor + // CHECK: %[[MEAN:.*]] = chlo.broadcast_divide %[[REDUCED]], %[[CONVERT]] {broadcast_dimensions = array} : (tensor, tensor) -> tensor // CHECK: %[[MEAN_CONVERTED:.*]] = mhlo.convert %[[MEAN]] : (tensor) -> tensor // CHECK: %[[SHAPE1:.*]] = shape.shape_of %[[MEAN_CONVERTED]] : tensor -> tensor<1xindex> // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index @@ -4120,8 +4120,8 @@ func.func @rng_std_normal(%arg0: tensor<3xi32>) -> tensor<12x?x64xf32> { func.func @range(%arg0: tensor, %arg1: tensor) -> tensor<5xf32> { %1 = "tf.Const"() {device = "", dtype = "tfdtype$DT_FLOAT", name = "range/limit", value = dense<5.000000e+00> : tensor} : () -> tensor // CHECK-DAG: [[IOTA:%.*]] = "mhlo.iota" - // CHECK-DAG: [[MUL:%.*]] = chlo.broadcast_multiply [[IOTA]], [[DELTA]] {broadcast_dimensions = dense<> : tensor<0xi64>} - // CHECK: chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = chlo.broadcast_multiply [[IOTA]], [[DELTA]] {broadcast_dimensions = array} + // CHECK: 
chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = array} %3 = "tf.Range"(%arg0, %1, %arg1) {Tidx = "tfdtype$DT_FLOAT", device = "", name = "range"} : (tensor, tensor, tensor) -> tensor<5xf32> func.return %3 : tensor<5xf32> } @@ -4142,8 +4142,8 @@ func.func @range_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tensor : tensor<0xi64>} - // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add [[MUL]], [[CONVERT_3]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[IOTA]], [[CONVERT_4]] {broadcast_dimensions = array} + // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add [[MUL]], [[CONVERT_3]] {broadcast_dimensions = array} %2 = "tf.Range"(%arg0, %arg1, %arg2) {Tidx = "tfdtype$DT_FLOAT", device = "", name = "range"} : (tensor, tensor, tensor) -> tensor // CHECK: return [[ADD]] @@ -4166,8 +4166,8 @@ func.func @range_int_dynamic(%arg0: tensor, %arg1: tensor, %arg2: tens // CHECK-DAG: [[IOTA:%.+]] = "mhlo.dynamic_iota"([[RESHAPE]]) {iota_dimension = 0 : i64} // CHECK-DAG: [[CONVERT_3:%.+]] = mhlo.convert %arg0 // CHECK-DAG: [[CONVERT_4:%.+]] = mhlo.convert %arg2 - // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[IOTA]], [[CONVERT_4]] {broadcast_dimensions = dense<> : tensor<0xi64>} - // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add [[MUL]], [[CONVERT_3]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.+]] = chlo.broadcast_multiply [[IOTA]], [[CONVERT_4]] {broadcast_dimensions = array} + // CHECK-DAG: [[ADD:%.+]] = chlo.broadcast_add [[MUL]], [[CONVERT_3]] {broadcast_dimensions = array} %2 = "tf.Range"(%arg0, %arg1, %arg2) {Tidx = "tfdtype$DT_FLOAT", device = "", name = "range"} : (tensor, tensor, tensor) -> tensor // CHECK: return [[ADD]] @@ -4186,8 +4186,8 @@ func.func @linspace_static(%arg0: tensor, %arg1: tensor) -> tensor<4xf // CHECK-DAG: [[STEP_NUMERATOR:%.*]] = chlo.broadcast_subtract [[STOP]], [[START]] // CHECK-DAG: [[STEP:%.*]] = chlo.broadcast_divide [[STEP_NUMERATOR]], 
[[STEP_DENOMINATOR]] // CHECK-DAG: [[IOTA:%.*]] = "mhlo.iota"() {iota_dimension = 0 : i64} - // CHECK-DAG: [[MUL:%.*]] = chlo.broadcast_multiply [[IOTA]], [[STEP]] {broadcast_dimensions = dense<> : tensor<0xi64>} - // CHECK-DAG: [[LINSPACE:%.*]] = chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = dense<> : tensor<0xi64>} + // CHECK-DAG: [[MUL:%.*]] = chlo.broadcast_multiply [[IOTA]], [[STEP]] {broadcast_dimensions = array} + // CHECK-DAG: [[LINSPACE:%.*]] = chlo.broadcast_add [[MUL]], [[START]] {broadcast_dimensions = array} // CHECK: return [[LINSPACE]] %0 = "tf.Const"() {_output_shapes = ["tfshape$"], device = "", dtype = i32, value = dense<4> : tensor} : () -> tensor %1 = "tf.LinSpace"(%arg0, %arg1, %0) : (tensor, tensor, tensor) -> tensor<4xf32> @@ -5334,7 +5334,7 @@ func.func @random_shuffle_3D(%input: tensor<4x?x16xf32>) -> tensor<4x?x16xf32> { // CHECK-SAME: -> tensor<2x3x5x7xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<2x3x5x7xf32> // CHECK: [[CONV16:%.+]] = mhlo.convert [[DIV_RESULT]] // CHECK-SAME: -> tensor<2x3x5x7xf16> @@ -5360,7 +5360,7 @@ func.func @avgpool_valid_padding(%arg0: tensor<2x12x21x7xf16>) -> tensor<2x3x5x7 // CHECK-SAME: -> tensor<2x4x3x5x7xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<2x4x3x5x7xf32> // CHECK: [[CONV16:%.+]] = mhlo.convert [[DIV_RESULT]] // CHECK-SAME: -> tensor<2x4x3x5x7xf16> @@ -5386,7 +5386,7 @@ func.func @avgpool_3d_valid_padding(%arg0: tensor<2x4x12x21x7xf16>) -> tensor<2x // CHECK-SAME: -> tensor<2x7x3x5xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> 
: tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<2x7x3x5xf32> // CHECK: [[CONV16:%.+]] = mhlo.convert [[DIV_RESULT]] // CHECK-SAME: -> tensor<2x7x3x5xf16> @@ -5412,7 +5412,7 @@ func.func @avgpool_nchw_format(%arg0: tensor<2x7x12x21xf16>) -> tensor<2x7x3x5xf // CHECK-SAME: -> tensor<2x7x4x3x5xf32> // CHECK: [[COUNT:%.+]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: [[DIV_RESULT:%.+]] = chlo.broadcast_divide [[DIVIDEND]], [[COUNT]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<2x7x4x3x5xf32> // CHECK: [[CONV16:%.+]] = mhlo.convert [[DIV_RESULT]] // CHECK-SAME: -> tensor<2x7x4x3x5xf16> @@ -5497,7 +5497,7 @@ func.func @avgpool_3d_same_padding(%arg0: tensor<2x4x12x21x7xf32>) -> tensor<2x4 // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = chlo.broadcast_divide %[[OUT_GRAD]], %[[DIVISOR]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<10x12x16x64xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) // CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]> @@ -5530,7 +5530,7 @@ func.func @avgpool_grad_valid_padding(%grad: tensor<10x12x16x64xf32>) -> tensor< // CHECK-SAME: %[[OUT_GRAD:.*]]: tensor<10x8x12x16x64xf32>) -> tensor<10x8x24x32x64xf32> { // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor -// CHECK: %[[OUT_GRAD_DIVIDED:.*]] = chlo.broadcast_divide %[[OUT_GRAD]], %[[DIVISOR]] {broadcast_dimensions = dense<> : tensor<0xi64>} : (tensor<10x8x12x16x64xf32>, tensor) -> tensor<10x8x12x16x64xf32> +// CHECK: 
%[[OUT_GRAD_DIVIDED:.*]] = chlo.broadcast_divide %[[OUT_GRAD]], %[[DIVISOR]] {broadcast_dimensions = array} : (tensor<10x8x12x16x64xf32>, tensor) -> tensor<10x8x12x16x64xf32> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) // CHECK-SAME: edge_padding_high = dense<[0, 0, 1, 1, 0]> // CHECK-SAME: edge_padding_low = dense<[0, 0, 1, 1, 0]> @@ -5724,7 +5724,7 @@ func.func @avgpool_3d_grad_ncdwh_format(%grad: tensor<2x9x8x4x7xf32>) -> tensor< // CHECK-DAG: %[[ZERO:.*]] = mhlo.constant dense<0.000000e+00> : tensor // CHECK-DAG: %[[DIVISOR:.*]] = mhlo.constant dense<4.000000e+00> : tensor // CHECK: %[[OUT_GRAD_DIVIDED:.*]] = chlo.broadcast_divide %[[OUT_GRAD]], %[[DIVISOR]] -// CHECK-SAME: broadcast_dimensions = dense<> +// CHECK-SAME: broadcast_dimensions = array // CHECK-SAME: -> tensor<10x12x16x64xbf16> // CHECK: %[[REDUCE_WINDOW_INPUT:.*]] = "mhlo.pad"(%[[OUT_GRAD_DIVIDED]], %[[ZERO]]) // CHECK-SAME: edge_padding_high = dense<[0, 1, 1, 0]> diff --git a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization-no-chlo.mlir b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization-no-chlo.mlir index 09374ca8006a2f..673e6c9ffd329a 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization-no-chlo.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization-no-chlo.mlir @@ -6,6 +6,6 @@ // CHECK-LABEL: allows_chlo func.func @allows_chlo(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : tensor<1x32x10x32xi32> } diff --git a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir 
b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir index c91c7c9da8ac77..e6623350380fcb 100644 --- a/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir +++ b/tensorflow/compiler/mlir/tf2xla/tests/verify-tfxla-legalization.mlir @@ -31,7 +31,7 @@ func.func @invalid_mixed_mhlo() -> (tensor<8x64x128xcomplex> {mhlo.sharding func.func @fails_chlo(%arg0: tensor<1x32x10x32xi32>, %arg1: tensor<32xi32>) -> tensor<1x32x10x32xi32> { // expected-error @+1 {{Could not legalize op: chlo.broadcast_add}} - %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = dense<3> : tensor<1xi64>} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> + %0 = "chlo.broadcast_add"(%arg0, %arg1) {broadcast_dimensions = array} : (tensor<1x32x10x32xi32>, tensor<32xi32>) -> tensor<1x32x10x32xi32> func.return %0 : tensor<1x32x10x32xi32> } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD index 0f0b1182e50bb7..28a459ccff2eac 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/BUILD +++ b/tensorflow/compiler/mlir/tf2xla/transforms/BUILD @@ -270,7 +270,9 @@ cc_library( "//tensorflow/core:lib", "//tensorflow/core:protos_all_cc", "//tensorflow/core/util/quantization:uniform_quant_ops_params", + "@com_google_absl//absl/base:core_headers", "@com_google_absl//absl/strings", + "@com_google_absl//absl/synchronization", "@com_google_absl//absl/types:span", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc index b8818bb9d3824b..4d96bb24acf152 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalization_op_config_test.cc @@ -132,7 +132,7 @@ TEST_F(LegalizationOpConfigTest, CountLoweringsSet) { // a new op, we should expect 
these to change too. EXPECT_EQ(mlir_lowering_count, 67); EXPECT_EQ(tf2xla_fallback_count, 316); - EXPECT_EQ(non_categorized_count, 422); + EXPECT_EQ(non_categorized_count, 423); } // Just a counter test to see which ops have duplicate lowerings. This isn't a @@ -224,7 +224,7 @@ TEST_F(LegalizationOpConfigTest, MlirLoweringWithoutXlaKernel) { } } - EXPECT_EQ(mlir_without_xla_count, 14); + EXPECT_EQ(mlir_without_xla_count, 13); } } // namespace mhlo diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc index 70370bffc41f20..76056458079964 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf.cc @@ -39,6 +39,7 @@ limitations under the License. #include "mlir/Dialect/Tensor/IR/Tensor.h" // from @llvm-project #include "mlir/Dialect/Traits.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project #include "mlir/IR/Diagnostics.h" // from @llvm-project @@ -497,7 +498,7 @@ static void CreateWhile32(Location loc, int num_iterations, // Increment the loop induction variable by one. auto one = builder->create(loc, builder->getI32IntegerAttr(1)); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, builder); + auto scalar_broadcast_dims = builder->getDenseI64ArrayAttr({}); auto plus_one = builder->create( loc, block->getArgument(0), one, scalar_broadcast_dims); // Prepend with the updated loop induction variable. 
@@ -2172,7 +2173,7 @@ class ConvertFusedBatchNormGradBase non_feature_dims.push_back(i); } auto reduce_dims = GetI64ElementsAttr(non_feature_dims, &rewriter); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto scalar_broadcast_dims = rewriter.getDenseI64ArrayAttr({}); // scratch1 = rsqrt(var + epsilon) RankedTensorType scalar_float = @@ -2315,7 +2316,7 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { Value corrected_variance = rewriter.create( op.getLoc(), batch_variance.getType(), batch_variance, - factor_const_op, /*broadcast_dimensions=*/DenseIntElementsAttr()); + factor_const_op, /*broadcast_dimensions=*/DenseI64ArrayAttr()); // Convert back to input type to stay aligned with expected output type // for TF op. @@ -2335,24 +2336,24 @@ class ConvertFusedBatchNormBase : public OpRewritePattern { // new_running_mean = alpha * old_mean + beta * batch_mean. auto alpha_mul_old_mean = rewriter.create( op.getLoc(), op.getMean().getType(), alpha, op.getMean(), - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); auto beta_mul_batch_mean = rewriter.create( op.getLoc(), batch_mean.getType(), beta, batch_mean, - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); batch_mean = rewriter.create( op.getLoc(), alpha_mul_old_mean, beta_mul_batch_mean, - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); // new_running_variance = alpha * old_variance + beta * batch_variance. 
auto alpha_mul_old_variance = rewriter.create( op.getLoc(), op.getVariance().getType(), alpha, op.getVariance(), - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); auto beta_mul_batch_variance = rewriter.create( op.getLoc(), corrected_variance.getType(), beta, corrected_variance, - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); corrected_variance = rewriter.create( op.getLoc(), alpha_mul_old_variance, beta_mul_batch_variance, - /*broadcast_dimensions=*/DenseIntElementsAttr()); + /*broadcast_dimensions=*/DenseI64ArrayAttr()); } if (std::is_same::value) { @@ -2522,7 +2523,7 @@ Operation *AvgPoolDivideByCount( // Divide `pooled` by window counts. Value divisor = GetScalarConstOfType(element_type, loc, window_count, &rewriter); - auto scalar_broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto scalar_broadcast_dims = rewriter.getDenseI64ArrayAttr({}); result = rewriter.create( loc, pooled_type, pooled, divisor, scalar_broadcast_dims); } else { @@ -4091,7 +4092,7 @@ class GenericConvertReductionOp : public OpRewritePattern { Value divisor = rewriter.create( loc, tensorflow::GetTypeFromTFTensorShape({}, reduce_element_type), divisor_tensor); - auto broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto broadcast_dims = rewriter.getDenseI64ArrayAttr({}); result = rewriter.create(loc, result, divisor, broadcast_dims); } @@ -6103,7 +6104,7 @@ class ConvertXlaReduceScatterOp if (replica_group_size == 0) return failure(); auto divisor = GetScalarConstOfType(element_type, loc, replica_group_size, &rewriter); - auto broadcast_dims = GetI64ElementsAttr({}, &rewriter); + auto broadcast_dims = rewriter.getDenseI64ArrayAttr({}); result = rewriter.create( loc, result, divisor.getResult(), broadcast_dims); } diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc index 
d7937cce42ff24..54bd5812644488 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_collective.cc @@ -176,7 +176,7 @@ LogicalResult ConvertAllReduce(OpBuilder& builder, int64_t channel_id, } auto divisor = GetScalarConstOfType(element_type, loc, replica_group_size, &builder); - auto broadcast_dims = GetI64ElementsAttr({}, &builder); + auto broadcast_dims = builder.getDenseI64ArrayAttr({}); result = builder.create( loc, all_reduce.getResult(0), divisor.getResult(), broadcast_dims); } else if (final_op != "Id") { diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc index 763e94734f6d01..3e8dd5b58ed2f1 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_communication.cc @@ -16,6 +16,9 @@ limitations under the License. // This file implements logic for lowering TensorFlow dialect's communication // ops (TF/XLA) to the HLO dialect. +#include +#include +#include #include #include #include @@ -31,6 +34,7 @@ limitations under the License. #include "mlir/IR/Builders.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project #include "mlir/IR/TypeUtilities.h" // from @llvm-project #include "mlir/IR/Value.h" // from @llvm-project #include "mlir/IR/Visitors.h" // from @llvm-project @@ -69,6 +73,28 @@ class LegalizeTFCommunication void runOnOperation() override; }; +// A generator to serve out unique channel ids. 
+class ChannelIdGenerator { + public: + ChannelIdGenerator() = default; + ChannelIdGenerator(const ChannelIdGenerator&) = delete; + ChannelIdGenerator& operator=(const ChannelIdGenerator&) = delete; + ChannelIdGenerator(ChannelIdGenerator&&) = delete; + ChannelIdGenerator& operator=(ChannelIdGenerator&&) = delete; + int64_t operator++(int) { return next(); } + int64_t next() { return channel_id_.fetch_add(1, std::memory_order_relaxed); } + + private: + // All usage code expects positive int64_t values so we can't use uint64_t + // and will just have to limit ourselves to half the number space. + std::atomic channel_id_ = 1; +}; + +int64_t GetNextChannelId() { + static ChannelIdGenerator* channel_id = new ChannelIdGenerator(); + return channel_id->next(); +} + // Checks if an op is a TF/XLA communication op. bool IsCommunicationOp(Operation* op) { return isa( loc, token.getType(), operand, token, channel_handle, @@ -273,12 +299,12 @@ Value CreateSendOp(OpBuilder& builder, int64_t& channel_id, Location loc, } // Creates a `mhlo.recv` op for receiving a value. -Value CreateRecvOp(OpBuilder& builder, int64_t& channel_id, Location loc, - Value result, StringRef key, size_t index, Value token, +Value CreateRecvOp(OpBuilder& builder, Location loc, Value result, + StringRef key, size_t index, Value token, StringRef host_handler_name, bool manual_sharding) { // type 3 == HOST_TO_DEVICE auto channel_handle = ChannelHandleAttr::get(builder.getContext(), - /*handle=*/channel_id++, + /*handle=*/GetNextChannelId(), /*type=*/3); auto result_type = result.getType(); SmallVector recv_result_type = {result_type, token.getType()}; @@ -315,7 +341,7 @@ Value CreateSinkToken(OpBuilder& builder, Location loc, ArrayRef tokens, // ops per operand and result. Unique Channel IDs are assigned per transfer. // Sink tokens are created across all `mhlo.send` ops first and then by // all `mhlo.recv` ops. 
-Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, +Value RewriteHostComputeOp(OpBuilder& builder, TF::_XlaHostComputeMlirOp host_compute, Value token) { builder.setInsertionPoint(host_compute); @@ -325,7 +351,7 @@ Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, SmallVector send_tokens; for (auto operand : llvm::enumerate(host_compute.getInputs())) { auto send_token = CreateSendOp( - builder, channel_id, loc, operand.value(), host_compute.getSendKey(), + builder, loc, operand.value(), host_compute.getSendKey(), operand.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName, manual_sharding); send_tokens.push_back(send_token); @@ -335,9 +361,8 @@ Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, SmallVector recv_tokens; for (auto result : llvm::enumerate(host_compute.getOutputs())) { auto recv_token = CreateRecvOp( - builder, channel_id, loc, result.value(), host_compute.getRecvKey(), - result.index(), token, xla::kXlaHostTransferTfRendezvousHandlerName, - manual_sharding); + builder, loc, result.value(), host_compute.getRecvKey(), result.index(), + token, xla::kXlaHostTransferTfRendezvousHandlerName, manual_sharding); recv_tokens.push_back(recv_token); } token = CreateSinkToken(builder, loc, recv_tokens, token); @@ -347,11 +372,11 @@ Value RewriteHostComputeOp(OpBuilder& builder, int64_t& channel_id, } // Replaces `tf.XlaSendToHost` with a `mhlo.send`. 
-Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, - TF::XlaSendToHostOp send_to_host, Value token) { +Value RewriteSendToHostOp(OpBuilder& builder, TF::XlaSendToHostOp send_to_host, + Value token) { builder.setInsertionPoint(send_to_host); - token = CreateSendOp(builder, channel_id, send_to_host.getLoc(), - send_to_host.getInput(), send_to_host.getKey(), + token = CreateSendOp(builder, send_to_host.getLoc(), send_to_host.getInput(), + send_to_host.getKey(), /*index=*/0, token, xla::kXlaHostTransferTfRendezvousHandlerName, /*manual_sharding=*/false); @@ -361,10 +386,10 @@ Value RewriteSendToHostOp(OpBuilder& builder, int64_t& channel_id, } // Replaces `tf.XlaRecvFromHost` with a `mhlo.recv`. -Value RewriteRecvFromHostOp(OpBuilder& builder, int64_t& channel_id, +Value RewriteRecvFromHostOp(OpBuilder& builder, TF::XlaRecvFromHostOp recv_from_host, Value token) { builder.setInsertionPoint(recv_from_host); - token = CreateRecvOp(builder, channel_id, recv_from_host.getLoc(), + token = CreateRecvOp(builder, recv_from_host.getLoc(), recv_from_host.getOutput(), recv_from_host.getKey(), /*index=*/0, token, xla::kXlaHostTransferTfRendezvousHandlerName, @@ -795,7 +820,7 @@ void RewriteFunctionTerminator(OpBuilder& builder, // rewritten to create a token or take in and return a token, depending on its // visibility and if there are any callers. 
LogicalResult RewriteFunction( - OpBuilder& builder, int64_t& channel_id, ModuleOp module, FuncOp func, + OpBuilder& builder, ModuleOp module, FuncOp func, const llvm::SmallDenseMap& funcs, const llvm::SmallPtrSetImpl& control_flow_ops, const llvm::SmallPtrSetImpl& control_flow_blocks, bool is_clone) { @@ -832,11 +857,11 @@ LogicalResult RewriteFunction( Operation* next_op = curr_op->getNextNode(); if (auto host_compute = dyn_cast(curr_op)) { - token = RewriteHostComputeOp(builder, channel_id, host_compute, token); + token = RewriteHostComputeOp(builder, host_compute, token); } else if (auto send_to_host = dyn_cast(curr_op)) { - token = RewriteSendToHostOp(builder, channel_id, send_to_host, token); + token = RewriteSendToHostOp(builder, send_to_host, token); } else if (auto recv_from_host = dyn_cast(curr_op)) { - token = RewriteRecvFromHostOp(builder, channel_id, recv_from_host, token); + token = RewriteRecvFromHostOp(builder, recv_from_host, token); } else if (auto call = dyn_cast(curr_op)) { // Only `mlir::func::CallOp` is supported as this requires knowing how to // rewrite arguments and results to a function. @@ -929,14 +954,11 @@ void LegalizeTFCommunication::runOnOperation() { if (failed(GetFunctionsToRewrite(module, funcs_to_rewrite))) return signalPassFailure(); - // Module level counter to make sure Channel IDs are unique. 
- int64_t channel_id = 1; OpBuilder builder(&getContext()); for (const auto& func_and_name : funcs_to_rewrite) { const auto& func_to_rewrite = func_and_name.getSecond(); func::FuncOp func = func_to_rewrite.original; - if (failed(RewriteFunction(builder, channel_id, module, func, - funcs_to_rewrite, + if (failed(RewriteFunction(builder, module, func, funcs_to_rewrite, func_to_rewrite.control_flow_ops, func_to_rewrite.control_flow_blocks, /*is_clone=*/false))) @@ -949,8 +971,8 @@ void LegalizeTFCommunication::runOnOperation() { GetCommunicationControlFlowOps(clone, funcs_to_rewrite, clone_control_flow_ops, clone_control_flow_blocks); - if (failed(RewriteFunction(builder, channel_id, module, clone, - funcs_to_rewrite, clone_control_flow_ops, + if (failed(RewriteFunction(builder, module, clone, funcs_to_rewrite, + clone_control_flow_ops, clone_control_flow_blocks, /*is_clone=*/true))) llvm_unreachable( diff --git a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td index 0ee5d1dee5925d..108b1bf6e6bc86 100644 --- a/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td +++ b/tensorflow/compiler/mlir/tf2xla/transforms/legalize_tf_patterns.td @@ -75,7 +75,7 @@ def : Pat<(TF_ApproximateEqualOp:$result $x, $y, $tolerance), (CHLO_BroadcastCompareOp (MHLO_AbsOp:$abs (MHLO_SubtractOp $x, $y)), (CastValueToElementType $result, (MHLO_ConstantOp $tolerance), $abs), - (NullDenseIntElementsAttr), + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE))>; @@ -158,18 +158,18 @@ def : Pat<(TF_FloorDivOp AnyTensor:$l, AnyTensor:$r), (CHLO_BroadcastCompareOp (CHLO_BroadcastCompareOp:$l_cmp $l, (MHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), - (NullDenseIntElementsAttr), CHLO_ComparisonDirectionValue<"LT">, + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp:$r_cmp $r, 
(MHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), - (NullDenseIntElementsAttr), CHLO_ComparisonDirectionValue<"LT">, + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (BinBroadcastDimensions $l_cmp, $r_cmp), CHLO_ComparisonDirectionValue<"NE">, (CHLO_DEFAULT_COMPARISON_TYPE)), - (NullDenseIntElementsAttr)), + (NullDenseI64ArrayAttr)), (CHLO_BroadcastSubOp $div, (MHLO_ConstantOp:$ones (GetScalarOfType<1> $div)), - (NullDenseIntElementsAttr)), $div), + (NullDenseI64ArrayAttr)), $div), [(SignedIntTensor $l)]>; // FloorDiv of unsigned is just div. @@ -189,19 +189,19 @@ def : Pat<(TF_FloorModOp AnyTensor:$l, AnyTensor:$r), (CHLO_BroadcastCompareOp (CHLO_BroadcastRemOp:$rem $l, $r, (BinBroadcastDimensions $l, $r)), (MHLO_ConstantOp:$l_zeros (GetScalarOfType<0> $l)), - (NullDenseIntElementsAttr), CHLO_ComparisonDirectionValue<"NE">, + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"NE">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp (CHLO_BroadcastCompareOp:$r_cmp $r, (MHLO_ConstantOp:$r_zeros (GetScalarOfType<0> $r)), - (NullDenseIntElementsAttr), CHLO_ComparisonDirectionValue<"LT">, + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (CHLO_BroadcastCompareOp:$rem_cmp $rem, $r_zeros, (BinBroadcastDimensions $rem, $r_zeros), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE)), (BinBroadcastDimensions $r_cmp, $rem_cmp), CHLO_ComparisonDirectionValue<"NE">, (CHLO_DEFAULT_COMPARISON_TYPE)), - (NullDenseIntElementsAttr)), + (NullDenseI64ArrayAttr)), (CHLO_BroadcastAddOp $r, $rem, (BinBroadcastDimensions $r, $rem)), $rem), [(TensorOf<[I8, I16, I32, I64, F16, F32, F64]> $l)]>; @@ -580,7 +580,7 @@ foreach Mapping = [ [TF_DigammaOp, CHLO_DigammaOp], [TF_ExpOp, MHLO_ExpOp], [TF_Expm1Op, MHLO_Expm1Op], - [TF_ErfOp, CHLO_ErfOp], + [TF_ErfOp, MHLO_ErfOp], [TF_ErfcOp, CHLO_ErfcOp], [TF_FloorOp, MHLO_FloorOp], [TF_ImagOp, MHLO_ImagOp], @@ -694,13 +694,13 @@ 
def : Pattern<(TF_SoftplusOp AnyTensor:$features), (CHLO_BroadcastAddOp:$threshold (MHLO_LogOp (MHLO_ConstantOp (EpsilonValue $features))), (MHLO_ConstantOp (GetScalarOfType<2> $features)), - (NullDenseIntElementsAttr) + (NullDenseI64ArrayAttr) ), (MHLO_SelectOp:$output (CHLO_BroadcastCompareOp $features, (MHLO_NegOp $threshold), - (NullDenseIntElementsAttr), + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"GT">, (CHLO_DEFAULT_COMPARISON_TYPE) ), @@ -709,7 +709,7 @@ def : Pattern<(TF_SoftplusOp AnyTensor:$features), (CHLO_BroadcastCompareOp $features, $threshold, - (NullDenseIntElementsAttr), + (NullDenseI64ArrayAttr), CHLO_ComparisonDirectionValue<"LT">, (CHLO_DEFAULT_COMPARISON_TYPE) ), diff --git a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc index 122a9084771d88..2c49198be7bad8 100644 --- a/tensorflow/compiler/mlir/tf_mlir_opt_main.cc +++ b/tensorflow/compiler/mlir/tf_mlir_opt_main.cc @@ -40,7 +40,6 @@ limitations under the License. 
#include "xla/mlir_hlo/lhlo/transforms/passes.h" #include "xla/mlir_hlo/mhlo/transforms/passes.h" #include "xla/service/cpu/hlo_xla_runtime_pipeline.h" -#include "xla/translate/mhlo_to_lhlo_with_xla/mhlo_to_lhlo_with_xla.h" int main(int argc, char **argv) { tensorflow::InitMlir y(&argc, &argv); diff --git a/tensorflow/compiler/mlir/tfr/BUILD b/tensorflow/compiler/mlir/tfr/BUILD index 10862b22c4f8d6..04cd4282e5c451 100644 --- a/tensorflow/compiler/mlir/tfr/BUILD +++ b/tensorflow/compiler/mlir/tfr/BUILD @@ -344,7 +344,6 @@ tf_python_pybind_extension( ":tfr", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/python/lib/core:pybind11_lib", - "//tensorflow/python/lib/core:pybind11_status", "@llvm-project//llvm:Support", "@llvm-project//mlir:ArithDialect", "@llvm-project//mlir:FuncDialect", @@ -352,6 +351,7 @@ tf_python_pybind_extension( "@llvm-project//mlir:Parser", "@llvm-project//mlir:SCFDialect", "@llvm-project//mlir:ShapeDialect", + "@llvm-project//mlir:Support", "@pybind11", ], ) diff --git a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc index 760ddab974c7fd..5d572f8278684b 100644 --- a/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc +++ b/tensorflow/compiler/mlir/tfr/python/tfr_wrapper.cc @@ -15,24 +15,25 @@ limitations under the License. 
#include +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" #include "llvm/Support/SourceMgr.h" -#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/SCF/IR/SCF.h" // from @llvm-project #include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project #include "mlir/IR/AsmState.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project #include "mlir/IR/Verifier.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "pybind11/pybind11.h" // from @pybind11 #include "pybind11/stl.h" // from @pybind11 -#include "tensorflow/compiler/mlir/tensorflow/dialect_registration.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_executor.h" -#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tfr/ir/tfr_ops.h" #include "tensorflow/python/lib/core/pybind11_lib.h" -#include "tensorflow/python/lib/core/pybind11_status.h" PYBIND11_MODULE(tfr_wrapper, m) { m.def("verify", [](std::string input) { diff --git a/tensorflow/compiler/mlir/tfrt/BUILD b/tensorflow/compiler/mlir/tfrt/BUILD index a73fc10ae083c4..315d9fca646d1d 100644 --- a/tensorflow/compiler/mlir/tfrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/BUILD @@ -313,25 +313,21 @@ cc_library( ":tf_to_tfrt", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:convert_type", - "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", - "//tensorflow/compiler/mlir/tensorflow:error_util", - "//tensorflow/compiler/mlir/tensorflow:import_model", "//tensorflow/compiler/mlir/tensorflow:translate_lib", - 
"//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms:tf_dialect_passes", "//tensorflow/core:framework", "//tensorflow/core:protos_all_cc", "//tensorflow/core/platform:status", "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/strings", + "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", + "@local_tsl//tsl/platform:errors", "@tf_runtime//:bef", "@tf_runtime//:core_runtime", - "@tf_runtime//:hostcontext", "@tf_runtime//:mlirtobef", - "@tf_runtime//:tensor", ], ) @@ -707,6 +703,7 @@ cc_library( cc_library( name = "backend_compiler", + srcs = ["backend_compiler.cc"], hdrs = ["backend_compiler.h"], deps = [ "//tensorflow/core/tfrt/runtime", diff --git a/third_party/xla/third_party/tsl/tsl/platform/gif.h b/tensorflow/compiler/mlir/tfrt/backend_compiler.cc similarity index 73% rename from third_party/xla/third_party/tsl/tsl/platform/gif.h rename to tensorflow/compiler/mlir/tfrt/backend_compiler.cc index 865b6f201e66fe..7c04c778fda8da 100644 --- a/third_party/xla/third_party/tsl/tsl/platform/gif.h +++ b/tensorflow/compiler/mlir/tfrt/backend_compiler.cc @@ -1,4 +1,4 @@ -/* Copyright 2015 The TensorFlow Authors. All Rights Reserved. +/* Copyright 2023 The TensorFlow Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,9 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. 
==============================================================================*/ -#ifndef TENSORFLOW_TSL_PLATFORM_GIF_H_ -#define TENSORFLOW_TSL_PLATFORM_GIF_H_ +#include "tensorflow/compiler/mlir/tfrt/backend_compiler.h" -#include "gif_lib.h" // from @gif +namespace tensorflow { -#endif // TENSORFLOW_TSL_PLATFORM_GIF_H_ +BackendCompiler::~BackendCompiler() = default; + +} diff --git a/tensorflow/compiler/mlir/tfrt/backend_compiler.h b/tensorflow/compiler/mlir/tfrt/backend_compiler.h index 827dc92bd72f2e..0e959f04f43554 100644 --- a/tensorflow/compiler/mlir/tfrt/backend_compiler.h +++ b/tensorflow/compiler/mlir/tfrt/backend_compiler.h @@ -17,13 +17,16 @@ limitations under the License. #define TENSORFLOW_COMPILER_MLIR_TFRT_BACKEND_COMPILER_H_ #include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/DialectRegistry.h" // from @llvm-project #include "tensorflow/core/tfrt/runtime/runtime.h" namespace tensorflow { class BackendCompiler { public: - virtual ~BackendCompiler() = default; + virtual ~BackendCompiler(); + + virtual void GetDependentDialects(mlir::DialectRegistry& registry) const {} // Compile the `module` in TF dialect. The result module should be also in TF // dialect. diff --git a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td index 87ee9fc91ca57e..7fbc42ad3db93f 100644 --- a/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td +++ b/tensorflow/compiler/mlir/tfrt/ir/mlrt/tf_mlrt_ops.td @@ -443,6 +443,8 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> { `tf.IfrtLoadVariableOp` converts the tensor into an IFRT array based on device and sharding configuration specified in `VariableDeviceShardingConfigProto`. + + This op returns a scalar string tensor as a key for the user to look up the loaded array.
}]; let arguments = (ins @@ -450,6 +452,10 @@ def IfrtLoadVariableOp: TensorflowMlrt_Op<"ifrt_load_variable", []> { StrAttr:$device_sharding_config_proto_text, StrAttr:$name ); + + let results = (outs + TFTensorType:$array_key + ); } diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc index b96eae576e4983..b15cc904b8039c 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.cc @@ -17,24 +17,24 @@ limitations under the License. #include -#include "absl/strings/str_split.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/Attributes.h" // from @llvm-project -#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/SymbolTable.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" -#include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" -#include "tensorflow/compiler/mlir/tensorflow/translate/import_model.h" #include "tensorflow/compiler/mlir/tensorflow/translate/tf_mlir_translate.h" #include "tensorflow/compiler/mlir/tensorflow/utils/convert_type.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" -#include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" -#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/errors.h" #include "tfrt/bef_converter/mlir_to_bef.h" // from @tf_runtime 
-#include "tfrt/core_runtime/core_runtime.h" // from @tf_runtime -#include "tfrt/core_runtime/op_handler.h" // from @tf_runtime -#include "tfrt/host_context/host_context.h" // from @tf_runtime -#include "tfrt/tensor/dense_host_tensor_view.h" // from @tf_runtime namespace tensorflow { namespace { diff --git a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h index 94b7f73fd73068..091e6642650b25 100644 --- a/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h +++ b/tensorflow/compiler/mlir/tfrt/saved_model/saved_model.h @@ -23,7 +23,11 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/strings/string_view.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLFunctionalExtras.h" +#include "llvm/ADT/StringRef.h" #include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_saved_model.h" #include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/framework/types.pb.h" diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/rewrite_cluster_to_ifrt_call.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/rewrite_cluster_to_ifrt_call.mlir index a23fafec92c028..dbb77732a3d6f6 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/ifrt/rewrite_cluster_to_ifrt_call.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/rewrite_cluster_to_ifrt_call.mlir @@ -1,17 +1,17 @@ -// RUN: tf-tfrt-opt -split-input-file -rewrite-cluster-to-ifrt-call=tpu-compile-metadata-debug %s | FileCheck %s +// RUN: tf-tfrt-opt -split-input-file -rewrite-cluster-to-ifrt-call %s | FileCheck %s // TODO(b/316226111): the printer may not guarantee the same order of fields. Rewrite the checks to be less sensitive to proto serialization formats. 
// ----- // Non-SPMD: one input and one output // // CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x3xf32> { // CHECK-NEXT: "tf.IfrtCall"(%arg0) -// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [], variable_names = []} +// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = []} // CHECK-SAME: (tensor<1x3xf32>) -> tensor<1x3xf32> // CHECK: return // -// CHECK: func.func private @_ifrt_program__func(%arg0: tensor<1x3xf32>) +// CHECK: func.func @_ifrt_program__func(%arg0: tensor<1x3xf32>) // CHECK-SAME: __tpu_compile_metadata_text = "args { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 3 } } kind: PARAMETER sharding { } is_bounded_dynamic_dim: false } retvals { sharding { } } num_replicas: 1 num_cores_per_replica: 1 " -// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64, tpu_compile_metadata = +// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64 // CHECK: return module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1"], tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1704 : i32}} { @@ -33,13 +33,13 @@ func.func private @_func(%arg0: tensor<1x3xf32>) -> (tensor<1x3xf32>) { // // CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) { // CHECK-NEXT: "tf.IfrtCall"(%arg0) -// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [], variable_names = []} +// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = []} // CHECK-SAME: (tensor<1x3xf32>) -> () // CHECK: return // -// CHECK: func.func private @_ifrt_program__func(%arg0: tensor<1x3xf32>) +// CHECK: func.func @_ifrt_program__func(%arg0: tensor<1x3xf32>) // CHECK-SAME: __tpu_compile_metadata_text = "args { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 3 } 
} kind: PARAMETER sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } is_bounded_dynamic_dim: false } num_replicas: 1 num_cores_per_replica: 2 device_assignment { replica_count: 1 computation_count: 2 computation_devices { replica_device_ids: 0 } computation_devices { replica_device_ids: 1 } } use_spmd_for_xla_partitioning: true " -// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64, tpu_compile_metadata = +// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64 // CHECK: return module attributes {tf.devices = ["/job:localhost/replica:0/task:0/device:CPU:0", "/job:localhost/replica:0/task:0/device:TPU_SYSTEM:0", "/job:localhost/replica:0/task:0/device:TPU:0", "/job:localhost/replica:0/task:0/device:TPU:1"], tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1704 : i32}} { @@ -60,17 +60,17 @@ func.func private @_func(%arg0: tensor<1x3xf32>) -> () { // CHECK-LABEL: func.func @serving_default(%arg0: tensor<3x1xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x1xf32> { // CHECK-NEXT: %0 = "tf.IfrtCall"(%arg1, %arg0) -// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [], variable_names = []} +// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [] // CHECK-SAME: (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> // CHECK-NEXT: %1 = "tf.Identity"(%arg1) {device = ""} : (tensor<1x3xf32>) -> tensor<1x3xf32> // CHECK-NEXT: %2 = "tf.IfrtCall"(%1, %arg0) -// CHECK-SAME: {program_id = [[PROGRAM_ID]] : i64, variable_arg_indices = [], variable_names = [] +// CHECK-SAME: {program_id = [[PROGRAM_ID]] : i64, variable_arg_indices = [] // CHECK-SAME: (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> // CHECK-NEXT: %3 = "tf.add"(%0, %2) : (tensor<1x1xf32>, tensor<1x1xf32>) -> tensor<1x1xf32> // CHECK: return // -// CHECK: func.func private @_ifrt_program__func(%arg0: tensor<1x3xf32>, %arg1: 
tensor<3x1xf32>) -> tensor<1x1xf32> -// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64, tpu_compile_metadata = +// CHECK: func.func @_ifrt_program__func(%arg0: tensor<1x3xf32>, %arg1: tensor<3x1xf32>) -> tensor<1x1xf32> +// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64 // CHECK-NEXT: %0 = "tf.MatMul"(%arg0, %arg1) // CHECK: return @@ -97,12 +97,12 @@ func.func private @_func(%arg0: tensor<1x3xf32>, %arg1: tensor<3x1xf32>) -> (ten // CHECK-LABEL: func.func @serving_default(%arg0: tensor<3x1xf32>, %arg1: tensor<1x3xf32>) -> tensor<1x1xf32> { // CHECK-NEXT: %0 = "tf.IfrtCall"(%arg1, %arg0) -// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [], variable_names = []} +// CHECK-SAME: {program_id = [[PROGRAM_ID:.*]] : i64, variable_arg_indices = [] // CHECK-SAME: (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> // CHECK: return // -// CHECK: func.func private @_ifrt_program__func(%arg0: tensor<1x3xf32>, %arg1: tensor<3x1xf32>) -> tensor<1x1xf32> -// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64, tpu_compile_metadata = +// CHECK: func.func @_ifrt_program__func(%arg0: tensor<1x3xf32>, %arg1: tensor<3x1xf32>) -> tensor<1x1xf32> +// CHECK-SAME: tfrt_ifrt_serving.program_id = [[PROGRAM_ID]] : i64 // CHECK-NEXT: %0 = "tf.MatMul"(%arg0, %arg1) // CHECK: return diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir new file mode 100644 index 00000000000000..ba644948c6b06d --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/sink_variable_as_named_array.mlir @@ -0,0 +1,44 @@ +// RUN: tf-tfrt-opt -split-input-file -sink-variable-as-named-array %s | FileCheck %s + +// ----- +// Basic test: all variables tensors are for devices and sinked as named ifrt arrays +// +// +// CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { +// CHECK-NEXT: [[HANDLE2:%.*]] 
= "tf.VarHandleOp" +// CHECK-NEXT: [[KEY:%.*]] = "tf.IfrtLoadVariable"([[HANDLE2]]) +// CHECK-SAME: device_sharding_config_proto_text = "sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } device_ids: 0 device_ids: 1 " +// CHECK-SAME: name = "__y" +// CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"([[KEY]], %arg0) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [0 : i32]}> +// CHECK-SAME: : (tensor, tensor<1x3xf32>) -> tensor<1x1xf32> +// CHECK-NEXT: return [[RES]] : tensor<1x1xf32> +// +module { + func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { + %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> + %2 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<3x1xf32> + %result = "tf.IfrtCall"(%2, %arg0) <{program_id = 6515870160938153680 : i64, variable_arg_indices = []}> { __tpu_compile_metadata_text = "args { dtype: DT_FLOAT shape { dim { size: 3 } dim { size: 1 } } kind: PARAMETER sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } is_bounded_dynamic_dim: false } args { dtype: DT_FLOAT shape { dim { size: 3 } dim { size: 1 } } kind: PARAMETER sharding { } is_bounded_dynamic_dim: false } retvals { sharding { } } num_replicas: 1 num_cores_per_replica: 2 device_assignment { replica_count: 1 computation_count: 2 computation_devices { replica_device_ids: 0 } computation_devices { replica_device_ids: 1 } } use_spmd_for_xla_partitioning: true "} : (tensor<3x1xf32>, tensor<1x3xf32>) -> (tensor<1x1xf32>) + return %result : tensor<1x1xf32> + } +} + +// ----- +// Variable tensor for host can still be used. 
+// +// CHECK-LABEL: func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { +// CHECK: "tf.VarHandleOp" +// CHECK-NEXT: [[VARIABLE:%.*]] = "tf.ReadVariableOp" +// CHECK-NEXT: [[KEY:%.*]] = "tf.IfrtLoadVariable" +// CHECK-NEXT: "tf.MatMul"(%arg0, [[VARIABLE]]) +// CHECK-NEXT: [[RES:%.*]] = "tf.IfrtCall"(%arg0, [[KEY]]) <{program_id = 6515870160938153680 : i64, variable_arg_indices = [1 : i32]}> +// CHECK-NEXT: return [[RES]] : tensor<1x1xf32> +// +module { + func.func @serving_default(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { + %0 = "tf.VarHandleOp"() <{container = "", shared_name = "y"}> : () -> tensor>> + %2 = "tf.ReadVariableOp"(%0) : (tensor>>) -> tensor<3x1xf32> + %3 = "tf.MatMul"(%arg0, %2) : (tensor<1x3xf32>, tensor<3x1xf32>) -> tensor<1x1xf32> + %result = "tf.IfrtCall"(%arg0, %2) <{program_id = 6515870160938153680 : i64, variable_arg_indices = []}> { __tpu_compile_metadata_text = "args { dtype: DT_FLOAT shape { dim { size: 1 } dim { size: 3 } } kind: PARAMETER sharding { type: OTHER tile_assignment_dimensions: 2 tile_assignment_dimensions: 1 tile_assignment_devices: 0 tile_assignment_devices: 1 } is_bounded_dynamic_dim: false } args { dtype: DT_FLOAT shape { dim { size: 3 } dim { size: 1 } } kind: PARAMETER sharding { } is_bounded_dynamic_dim: false } retvals { sharding { } } num_replicas: 1 num_cores_per_replica: 2 device_assignment { replica_count: 1 computation_count: 2 computation_devices { replica_device_ids: 0 } computation_devices { replica_device_ids: 1 } } use_spmd_for_xla_partitioning: true "} : (tensor<1x3xf32>, tensor<3x1xf32>) -> (tensor<1x1xf32>) + return %result : tensor<1x1xf32> + } +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_merging.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_merging.mlir new file mode 100644 index 00000000000000..cd2ee741ba86bf --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_merging.mlir @@ -0,0 +1,21 @@ +// RUN: tf-tfrt-opt %s 
-tf-restore-merging | FileCheck %s + +// CHECK-LABEL: func @single_restore_group +// CHECK-SAME: (%[[ARG0:.*]]: {{.*}}) +func.func @single_restore_group(%arg0: tensor) -> (tensor<*xf32>, tensor<*xi32>) { + %0 = "tf.Const"() {value = dense<"foo"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %1 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %2 = "tf.RestoreV2"(%arg0, %0, %1) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<*xf32> + + %3 = "tf.Const"() {value = dense<"bar"> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %4 = "tf.Const"() {value = dense<""> : tensor<1x!tf_type.string>} : () -> tensor<1x!tf_type.string> + %5 = "tf.RestoreV2"(%arg0, %3, %4) : (tensor, tensor<1x!tf_type.string>, tensor<1x!tf_type.string>) -> tensor<*xi32> + + // CHECK: %[[NAMES:.*]] = "tf.Const"() <{value = dense<["foo", "bar"]> : tensor<2x!tf_type.string>}> + // CHECK-NEXT: %[[SHAPES:.*]] = "tf.Const"() <{value = dense<""> : tensor<2x!tf_type.string>}> + // CHECK-NEXT: %[[TENSORS:.*]]:2 = "tf.RestoreV2"(%[[ARG0]], %[[NAMES]], %[[SHAPES]]) + // CHECK-SAME: -> (tensor<*xf32>, tensor<*xi32>) + + // CHECK: return %[[TENSORS]]#0, %[[TENSORS]]#1 + func.return %2, %5 : tensor<*xf32>, tensor<*xi32> +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_splitting.mlir b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_splitting.mlir new file mode 100644 index 00000000000000..1aafed888aa9b9 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/tests/ifrt/tf_restore_splitting.mlir @@ -0,0 +1,18 @@ +// RUN: tf-tfrt-opt %s -tf-restore-splitting | FileCheck %s + +// CHECK-LABEL: func @single_restore +// CHECK-SAME: (%[[ARG0:.*]]: {{.*}}) +func.func @single_restore(%arg0: tensor) -> (tensor<*xf32>, tensor<*xi32>) { + %0 = "tf.Const"() {value = dense<["foo", "bar"]> : tensor<2x!tf_type.string>} : () -> tensor<2x!tf_type.string> + %1 = "tf.Const"() {value = dense<""> : 
tensor<2x!tf_type.string>} : () -> tensor<2x!tf_type.string> + %2:2 = "tf.RestoreV2"(%arg0, %0, %1) : (tensor, tensor<2x!tf_type.string>, tensor<2x!tf_type.string>) -> (tensor<*xf32>, tensor<*xi32>) + + // CHECK: %[[FOO_NAME:.*]] = "tf.Const"() <{value = dense<"foo"> : tensor<1x!tf_type.string>}> + // CHECK: %[[FOO:.*]] = "tf.RestoreV2"(%[[ARG0]], %[[FOO_NAME]], {{.*}}) + + // CHECK: %[[BAR_NAME:.*]] = "tf.Const"() <{value = dense<"bar"> : tensor<1x!tf_type.string>}> + // CHECK: %[[BAR:.*]] = "tf.RestoreV2"(%[[ARG0]], %[[BAR_NAME]], {{.*}}) + + // CHECK: return %[[FOO]], %[[BAR]] + func.return %2#0, %2#1 : tensor<*xf32>, tensor<*xi32> +} diff --git a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir index 94a28091c7235f..eb2e0587364d6e 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/mlrt/tf_to_mlrt.mlir @@ -458,3 +458,21 @@ func.func @xla_func(%arg0: tensor<1x3xf32>) -> tensor<*xf32> attributes {tf.entr %2 = "tf.XlaLaunch"(%arg0, %1) {__op_key = 3: i32, _noinline = true, _xla_compile_device_type = "GPU", device = "/device:GPU:0", function = @xla_func_0, operandSegmentSizes = array} : (tensor<1x3xf32>, tensor<1x3xf32>) -> tensor<*xf32> func.return %2 : tensor<*xf32> } + +// ----- + +// Test lowering of IfrtLoadVariableOp + +// CHECK-LABEL: func @ifrt_load_variable_test +func.func @ifrt_load_variable_test() -> () { + // CHECK: [[HANDLE:%.*]] = tf_mlrt.executeop() + // CHECK-SAME: VarHandleOp + %0 = "tf.VarHandleOp"() {__op_key = 1: i32, device = "/device:CPU:0", container = "", shared_name = "variable"} : () -> tensor>> + // CHECK-NEXT: "tf_mlrt.ifrt_load_variable"([[HANDLE]]) + // CHECK-SAME: device_sharding_config_proto_text + // CHECK-SAME: name = "__variable" + %1 = "tf.IfrtLoadVariable"(%0) <{device_sharding_config_proto_text = "sharding { } device_ids: 0 device_ids: 1 ", name = "__variable"}> {__op_key = 2: i32, device = 
"/device:CPU:0"} : (tensor>>) -> (tensor) + // CHECK-NEXT: return + func.return +} + diff --git a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir index a2e2922cc7ee7f..a872b96a2fd6b4 100644 --- a/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir +++ b/tensorflow/compiler/mlir/tfrt/tests/tf_to_corert/decompose_resource_op.mlir @@ -8,8 +8,8 @@ module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, p // CHECK: [[const_th:%.*]] = corert.const_dense_tensor // CHECK-NEXT: [[const:%.*]] = tfrt_fallback_async.corert_tensorhandle_to_fallback_tensor [[const_th]] {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK-NEXT: [[out_chain:%.*]], [[value:%.*]] = tfrt_fallback_async.executeop.seq([[in_chain]]) key(0) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.ReadVariableOp"({{.*}}) -// CHECK-NEXT: [[res:%.*]] = tfrt_fallback_async.executeop key(1) cost({{.*}}) device("/device:CPU:0") "tf.GatherV2"([[value]], {{.*}}, [[const]]) -// CHECK-NEXT: [[res_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[res:%.*]] {device = "/device:CPU:0"} +// CHECK-NEXT: [[res:%.*]] = tfrt_fallback_async.executeop key(1) cost({{.*}}) device("/job:localhost/replica:0/task:0/device:CPU:0") "tf.GatherV2"([[value]], {{.*}}, [[const]]) +// CHECK-NEXT: [[res_th:%.*]] = tfrt_fallback_async.fallback_tensor_to_corert_tensorhandle [[res]] {device = "/job:localhost/replica:0/task:0/device:CPU:0"} // CHECK-NEXT: tfrt.return [[out_chain]], [[res_th]] : !tfrt.chain, !corert.tensorhandle func.func @gather(%indices: tensor, %resource: tensor<*x!tf_type.resource>) -> tensor<*xi32> { diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD index c3a2a757f3776b..d120d1fe4f5005 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD +++ 
b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/BUILD @@ -15,6 +15,8 @@ package_group( "//tensorflow/core/tfrt/saved_model/tests/...", ] + if_google([ "//learning/brain/tfrt/cpp_tests/...", + "//learning/pathways/serving/runtime/...", + "//learning/pathways/serving/tests/...", # Allow visibility from the mlir language server. "//learning/brain/mlir/mlir_lsp_server/...", ]), @@ -51,7 +53,10 @@ cc_library( name = "tf_ifrt_passes", srcs = [ "rewrite_cluster_to_ifrt_call.cc", + "sink_variable_as_named_array.cc", "tf_ifrt_passes.cc", + "tf_restore_merging.cc", + "tf_restore_splitting.cc", ], hdrs = [ "tf_ifrt_passes.h", @@ -66,22 +71,31 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:dump_mlir_util", "//tensorflow/compiler/mlir/tensorflow:error_util", "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_types", "//tensorflow/compiler/mlir/tensorflow:tpu_rewrite_device_util", "//tensorflow/compiler/mlir/tensorflow/transforms:tensorflow_passes", "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:tpu_metadata_utils", + "//tensorflow/compiler/mlir/tfrt:tf_to_tfrt", "//tensorflow/core:framework", + "//tensorflow/core/platform:protobuf", "//tensorflow/core/platform:random", "//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", + "//tensorflow/core/tfrt/ifrt:ifrt_config_proto_cc", "@com_google_absl//absl/base", + "@com_google_absl//absl/container:flat_hash_map", "@com_google_absl//absl/log", "@com_google_absl//absl/status", + "@com_google_absl//absl/status:statusor", "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@llvm-project//mlir:Support", + "@llvm-project//mlir:Transforms", "@local_tsl//tsl/platform:protobuf", + "@local_xla//xla:xla_data_proto_cc", + "@local_xla//xla/service:computation_placer_hdr", ], ) @@ -104,6 +118,7 @@ cc_library( 
"//tensorflow/core/protobuf/tpu:compile_metadata_proto_cc", "//tensorflow/core/protobuf/tpu:topology_proto_cc", "//tensorflow/core/tpu/kernels:tpu_compile_op_support", + "//tensorflow/core/tpu/kernels/xla:host_compute_ops", "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/status", @@ -115,7 +130,6 @@ cc_library( "@llvm-project//mlir:IR", "@llvm-project//mlir:Pass", "@local_tsl//tsl/platform:errors", - "@local_tsl//tsl/platform:protobuf", "@local_tsl//tsl/platform:statusor", "@local_xla//xla:shape_util", "@local_xla//xla:xla_data_proto_cc", @@ -141,7 +155,6 @@ tf_cc_test( ":tf2hlo", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/tf2xla:xla_helpers", - "//tensorflow/core:framework", "//tensorflow/core:test", "//tensorflow/core/platform:resource_loader", "@com_google_absl//absl/log", diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc index 412e86ef4e39f3..b0815b8f5c0272 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler.cc @@ -92,6 +92,7 @@ CompileAndRegisterIfrtPrograms(absl::string_view model_name, model_name, entry_function_name.str(), *std::move(submodule), ifrt_model_context.GetClient(), &ifrt_model_context.GetThreadPoolDevice(), + &ifrt_model_context.GetLoadedVariableRegistry(), ifrt_model_context.GetShapeRepresentationFn()); // Register the Ifrt program to `ServingExecutableRegistry` so that @@ -159,7 +160,7 @@ absl::Status IfrtBackendCompiler::CompileTensorflow( // Use bridge for cluster formation. 
TF_RETURN_IF_ERROR(tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tensorflow::tf2xla::v2::DeviceType::XLA_TPU_JIT, + module, /*is_supported_by_replicated_brige*/ true, /*is_in_fallback_enabled_mode=*/false)); if (VLOG_IS_ON(1)) { diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc index 7ade31aa2a4506..1b599fd2c33c2b 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_backend_compiler_test.cc @@ -17,7 +17,9 @@ limitations under the License. #include #include -#include + +// Enable definition of Eigen::ThreadPoolDevice instead of just declaration. +#define EIGEN_USE_THREADS #include #include "absl/strings/str_cat.h" @@ -79,8 +81,6 @@ TEST(IfrtBackendCompilerTest, Basic) { xla::ifrt::test_util::GetClient()); Eigen::ThreadPoolDevice thread_pool_device = GetThreadPoolDevice(); - IfrtModelContext model_context(client, &thread_pool_device); - std::unique_ptr runtime = tensorflow::tfrt_stub::DefaultTfrtRuntime(/*num_threads=*/1); tensorflow::tfrt_stub::GraphExecutionOptions graph_execution_options( @@ -90,7 +90,7 @@ TEST(IfrtBackendCompilerTest, Basic) { &graph_execution_options, /*export_dir=*/"", &resource_context); runtime_context.resource_context().CreateResource( - "IfrtModelContext", std::move(model_context)); + "IfrtModelContext", client, &thread_pool_device); IfrtBackendCompiler compiler; TF_ASSERT_OK(compiler.CompileTensorflow(runtime_context, mlir_module.get())); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h index 4cd5bf2cbfc3bb..3e4971826c67b6 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h @@ -21,14 +21,19 @@ limitations under the License. 
namespace tensorflow { namespace ifrt_serving { -// Attribute name of a serialized TpuCompileMetadataProto. This is backward -// compatible. -inline constexpr absl::string_view kMetadataAttrName = "tpu_compile_metadata"; // Attribute name of a text TpuCompileMetadataProto. Note that the text proto is -// not backward compatible and only used for debug. +// not backward compatible and shall not be serialized. inline constexpr absl::string_view kMetadataTextAttrName = "__tpu_compile_metadata_text"; +// Name of a variable as loaded IFRT array . +inline constexpr absl::string_view kVariableArrayNameAttr = + "__variable_array_name"; + +// Attribute of a text `VariableDeviceShardingConfigProto`. +inline constexpr absl::string_view kVariableShardingConfigTextAttr = + "__variable_sharding_config_text"; + } // namespace ifrt_serving } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td index a79e91f0422983..c725aa85b0157b 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.td @@ -26,11 +26,52 @@ def RewriteClusterToIfrtCallPass: Pass<"rewrite-cluster-to-ifrt-call", "mlir::Mo let dependentDialects = ["mlir::tf_device::TensorFlowDeviceDialect"]; let constructor = "CreateRewriteClusterToIfrtCallPass()"; + } - let options = [ - Option<"tpu_compile_metadata_debug_", "tpu-compile-metadata-debug", "bool", "false", - "if enabled, output compile metadata as readable string in " - "an extra __tpu_compile_metadata_debug attribute for debug">, - ]; +def SinkVariableAsNamedArrayPass: Pass<"sink-variable-as-named-array", "mlir::ModuleOp"> { + let summary = "Sink variable tensor for tpu device as named IFRT array for tf.IfrtCall"; + let description = [{ + This pass sinks variable tensor argument to `tf.IfrtCall` as variable_arg_indices + and variable_names attributes and also lowers `tf.ReadVariableOp` to + 
`tf.IfrtLoadVariableOp`. + + The runtime ensures that `tf.IfrtCall` kernel can bind the IFRT array by + its name as input to the TPU program. + + }]; + + let constructor = "CreateSinkVariableAsNamedArrayPass()"; } + +def TfRestoreSplittingPass + : Pass<"tf-restore-splitting", "mlir::func::FuncOp"> { + let summary = "Splits `tf.RestoreV2` ops"; + + let description = [{ + This pass splits each `tf.RestoreV2` op so that one restore op handles one + variable only. This pass can split restore ops only if the tensor names and + the shape/slices arguments are constants, which is usually the case. + + Splitting monolithic restore ops into per-tensor restore ops makes it easier + to shard SavedModel initialization across multiple clusters. + }]; + + let constructor = "CreateTfRestoreSplittingPass()"; +} + +def TfRestoreMergingPass : Pass<"tf-restore-merging", "mlir::func::FuncOp"> { + let summary = "Merges `tf.RestoreV2` ops"; + + let description = [{ + This pass merges multiple `tf.RestoreV2` ops into one `tf.RestoreV2` op + using variadic results. The current implementation merges restore ops only + if they have the same `prefix` and have constant tensor names and + shape/slice arguments. + + This pass is run in order to undo `tf-restore-splitting` after cluster + formation and reduce the op dispatch overhead. 
+ }]; + + let constructor = "CreateTfRestoreMergingPass()"; +} \ No newline at end of file diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc index 89428db2dd148c..b8bd4685919d4c 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/rewrite_cluster_to_ifrt_call.cc @@ -190,9 +190,15 @@ class RewriteClusterToIfrtCallPass return signalPassFailure(); } + auto metadata_attr = + ifrt_program->getAttrOfType(kMetadataTextAttrName); + if (!metadata_attr) { + return signalPassFailure(); + } + ifrt_call_op->setAttr(kMetadataTextAttrName, metadata_attr); + // TODO(b/304839793): populate variable names after adding a variable // hoisting pass. - ifrt_call_op.setVariableNamesAttr(builder.getArrayAttr({})); ifrt_call_op.setVariableArgIndicesAttr(builder.getI32ArrayAttr({})); ifrt_call_op.setProgramId(program_id); @@ -214,25 +220,24 @@ class RewriteClusterToIfrtCallPass if (mlir::failed(GetTpuCompileMetadata(cluster_func, devices, &metadata))) { return signalPassFailure(); } + std::string serialized_metadata; + tsl::protobuf::TextFormat::Printer printer; + printer.SetSingleLineMode(true); + printer.PrintToString(metadata, &serialized_metadata); - cloned_ifrt_program->setAttr( - kMetadataAttrName, builder.getStringAttr(metadata.SerializeAsString())); + cloned_ifrt_program->setAttr(kMetadataTextAttrName, + builder.getStringAttr(serialized_metadata)); - if (tpu_compile_metadata_debug_) { - std::string serialized_metadata; - tsl::protobuf::TextFormat::Printer printer; - printer.SetSingleLineMode(true); - printer.PrintToString(metadata, &serialized_metadata); - - cloned_ifrt_program->setAttr(kMetadataTextAttrName, - builder.getStringAttr(serialized_metadata)); - } cloned_ifrt_program.setName(ifrt_program_name); int64_t program_id = NewProgramId(); 
cloned_ifrt_program->setAttr("tfrt_ifrt_serving.program_id", builder.getI64IntegerAttr(program_id)); + // Make clonet ifrt program public so that it does not get dropped by + // inliner. + cloned_ifrt_program.setPublic(); + builder.setInsertionPoint(cluster_func); mlir::TF::IfrtCallOp ifrt_call_op = builder.create( @@ -241,9 +246,12 @@ class RewriteClusterToIfrtCallPass // TODO(b/304839793): populate variable names after adding a variable // hoisting pass. - ifrt_call_op.setVariableNamesAttr(builder.getArrayAttr({})); ifrt_call_op.setVariableArgIndicesAttr(builder.getI32ArrayAttr({})); ifrt_call_op.setProgramId(program_id); + // Additionally attach tpu_compile_metadata to IfrtCallOp. Some subsequent + // pass such as SinkVariableAsNamedArrayPass relies on this attribute. + ifrt_call_op->setAttr(kMetadataTextAttrName, + builder.getStringAttr(serialized_metadata)); cluster_func->replaceAllUsesWith(ifrt_call_op.getResults()); cluster_func->erase(); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc new file mode 100644 index 00000000000000..450c441ef4e3f7 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/sink_variable_as_named_array.cc @@ -0,0 +1,359 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include +#include +#include +#include + +#include "absl/container/flat_hash_map.h" +#include "absl/status/status.h" +#include "absl/status/statusor.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Casting.h" +#include "mlir/IR/Attributes.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/OpDefinition.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/Types.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/ifrt_constants.h" +#include "xla/service/computation_placer.h" +#include "xla/xla_data.pb.h" +#include "tensorflow/core/platform/protobuf.h" // IWYU pragma: keep +#include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" +#include "tensorflow/core/tfrt/ifrt/ifrt_config.pb.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { + +#define GEN_PASS_DEF_SINKVARIABLEASNAMEDARRAYPASS +#define GEN_PASS_DECL_SINKVARIABLEASNAMEDARRAYPASS +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc" // IWYU pragma: keep + +class SinkVariableAsNamedArrayPass + : public 
impl::SinkVariableAsNamedArrayPassBase< + SinkVariableAsNamedArrayPass> { + public: + void runOnOperation() override { + mlir::ModuleOp module = getOperation(); + mlir::OpBuilder builder(&getContext()); + + absl::flat_hash_map variable_config_by_name; + llvm::SmallDenseMap + ifrt_call_argument_configs; + + // First, we backtrack from IFRT call to collect variable tensors that needs + // to converted to loaded ifrt arrays and their associated information such + // as their name and defining ops. + std::vector ifrt_call_ops; + module.walk([&ifrt_call_ops](mlir::TF::IfrtCallOp call) { + ifrt_call_ops.push_back(call); + }); + for (const auto& call : ifrt_call_ops) { + if (mlir::failed(CollectVariablesUsedByDevice( + call, variable_config_by_name, ifrt_call_argument_configs))) { + return signalPassFailure(); + } + } + + // Rewrite ReadVariableOp with IfrtLoadVariableOp + llvm::SmallDenseMap + read_to_load; + for (auto& [name, variable_config] : variable_config_by_name) { + for (auto& read_variable_op : variable_config.read_variable_op) { + builder.setInsertionPointAfter(read_variable_op); + // TODO(b/319045348): consider use resource alias analysis for this. + auto var_handle = GetDefiningOp( + read_variable_op.getResource()); + + if (!var_handle) { + read_variable_op->emitError( + "ReadVariableOp has no defining VarHandleOp."); + return signalPassFailure(); + } + + auto load_variable_op = builder.create( + read_variable_op->getLoc(), + mlir::RankedTensorType::get( + {}, builder.getType()), + var_handle.getResult(), + builder.getStringAttr(variable_config.device_sharding_config), + builder.getStringAttr(name)); + read_to_load[read_variable_op] = load_variable_op; + } + } + + // Rewrite ifrt call: variable tensors are sunk as attribute. + // The runtime guarantees the binding of corresponding loaded ifrt array + // based on attributes. 
+ for (auto& call : ifrt_call_ops) { + if (!call.getVariableArgIndicesAttr().empty()) { + call->emitError() << "Expect empty " + << call.getVariableArgIndicesAttrName().str() + << " attributes, but got " + << call.getVariableArgIndicesAttr().size() + << " elements"; + return signalPassFailure(); + } + if (call->getOpOperands().size() != + ifrt_call_argument_configs[call].size()) { + call->emitError() << "IfrtCallOp got " << call->getOpOperands().size() + << " operands, but expects " + << ifrt_call_argument_configs[call].size(); + return signalPassFailure(); + } + llvm::SmallVector variable_arg_indices; + llvm::SmallVector variable_arg_names; + llvm::SmallVector updated_args; + + for (const auto& [arg_idx, arg] : + llvm::enumerate(ifrt_call_argument_configs[call])) { + if (arg.is_variable) { + variable_arg_names.push_back( + builder.getStringAttr(arg.variable_name)); + variable_arg_indices.push_back(arg_idx); + // Variable use the key from IfrtLoadVariable. + updated_args.push_back( + read_to_load[arg.read_variable_op].getResult()); + } else { + // non variable + updated_args.push_back(call->getOperand(arg_idx)); + } + } + + builder.setInsertionPointAfter(call); + auto updated_ifrt_call = builder.create( + call->getLoc(), call.getResultTypes(), updated_args); + + updated_ifrt_call->setAttrs(call->getAttrs()); + // Update variable_arg_indices attribute. + updated_ifrt_call.setVariableArgIndicesAttr( + builder.getI32ArrayAttr(variable_arg_indices)); + + call.replaceAllUsesWith(updated_ifrt_call); + call.erase(); + } + + // Delete all ReadVariableOps that are not used. + for (auto& [name, variable_config] : variable_config_by_name) { + for (auto& read_variable_op : variable_config.read_variable_op) { + if (read_variable_op.use_empty()) { + read_variable_op.erase(); + } + } + } + } + + private: + struct VariableConfig { + // VariableDeviceShardingConfig text proto. + std::string device_sharding_config; + // All ReadVariableOps that returns this named variable. 
+ std::vector read_variable_op; + }; + struct IfrtArgConfig { + bool is_variable; + std::string variable_name; + mlir::TF::ReadVariableOp read_variable_op; + }; + using IfrtArgConfigList = llvm::SmallVector; + + // Find defining ReadVariableOps and also build argument configuration map of + // a IfrtCallOp. + mlir::LogicalResult CollectVariablesUsedByDevice( + mlir::TF::IfrtCallOp call, + absl::flat_hash_map& variable_config_by_name, + llvm::SmallDenseMap& + ifrt_call_argument_configs) { + IfrtArgConfigList& args = ifrt_call_argument_configs[call]; + + tensorflow::tpu::TPUCompileMetadataProto metadata; + + // TODO(b/319045348): remove the usage kMetadataAttrName. + auto metadata_attr = + call->getAttrOfType(kMetadataTextAttrName); + if (metadata_attr && !metadata_attr.empty()) { + if (!tensorflow::protobuf::TextFormat::ParseFromString( + metadata_attr.getValue().str(), &metadata)) { + return call.emitError() + << "Failed to parse TPUCompileMetadataProto from attr :" + << metadata_attr.getValue().str(); + } + } else { + return call.emitError() + << "Failed to Get TPUCompileMetadataProto from attr"; + } + + for (const auto& [arg_idx, input] : llvm::enumerate(call->getOperands())) { + // Assuming the nested function calls are inlined. 
+ if (auto read_variable_op = + GetDefiningOp(input)) { + mlir::FailureOr variable_tensor_name = + GetVariableTensorName(read_variable_op); + + if (mlir::failed(variable_tensor_name)) { + return mlir::failure(); + } + + absl::StatusOr device_sharding_config = + GetVariableShardingConfig(metadata, arg_idx); + if (!device_sharding_config.ok()) { + return call->emitError() + << "Fail to get device sharding config for argument index " + << arg_idx; + } + VariableConfig& variable_config = + variable_config_by_name[*variable_tensor_name]; + if (!variable_config.read_variable_op.empty()) { + if (variable_config.device_sharding_config != + *device_sharding_config) { + return call->emitError() + << "A variable tensor has different sharding config: " + << variable_config.device_sharding_config << " vs " + << *device_sharding_config; + } + } else { + variable_config.device_sharding_config = *device_sharding_config; + } + + variable_config.read_variable_op.push_back(read_variable_op); + args.push_back({.is_variable = true, + .variable_name = *variable_tensor_name, + .read_variable_op = read_variable_op}); + } else { + args.push_back({.is_variable = false}); + } + } + + return mlir::success(); + } + + // The returned variable tensor name is used both as an internal hash key, + // and as the binding name between the tensor and the array in the + // runtime. 
+ std::string GetVariableTensorName(mlir::TF::VarHandleOp var_handle) { + return absl::StrCat(absl::string_view(var_handle.getContainer()), "__", + absl::string_view(var_handle.getSharedName())); + } + + mlir::FailureOr GetVariableTensorName( + mlir::TF::ReadVariableOp read_variable_op) { + mlir::Value variable_definition = read_variable_op.getResource(); + auto var_handle = GetDefiningOp(variable_definition); + + if (!var_handle) { + return read_variable_op->emitError("ReadVariableOp has no defining op."); + } + + return GetVariableTensorName(var_handle); + } + + absl::StatusOr GetVariableShardingConfig( + const tensorflow::tpu::TPUCompileMetadataProto& metadata, int arg_idx) { + tensorflow::ifrt_serving::VariableDeviceShardingConfigProto + device_sharding_config; + std::vector device_ids; + + if (metadata.has_device_assignment()) { + absl::StatusOr> da = + xla::DeviceAssignment::Deserialize(metadata.device_assignment()); + + if (!da.ok()) { + return da.status(); + } + if (metadata.num_replicas() != (*da)->replica_count() || + metadata.num_cores_per_replica() != (*da)->computation_count()) { + return absl::FailedPreconditionError(absl::StrCat( + "Device assignment has different replica count: ", + metadata.num_replicas(), " vs ", (*da)->replica_count(), + " or computation count: ", metadata.num_cores_per_replica(), " vs ", + (*da)->computation_count(), ".")); + } + + device_ids.reserve(metadata.num_replicas() * + metadata.num_cores_per_replica()); + for (int i = 0; i < (*da)->replica_count(); ++i) { + for (int j = 0; j < (*da)->computation_count(); ++j) { + device_ids.push_back((**da)(i, j)); + } + } + } else { + // Default use first N devices. 
+ device_ids.resize(metadata.num_replicas() * + metadata.num_cores_per_replica()); + std::iota(device_ids.begin(), device_ids.end(), 0); + } + + device_sharding_config.mutable_device_ids()->Assign(device_ids.begin(), + device_ids.end()); + + if (metadata.args_size() > 0) { + *device_sharding_config.mutable_sharding() = + metadata.args(arg_idx).sharding(); + } + + std::string proto_text; + tsl::protobuf::TextFormat::Printer printer; + printer.SetSingleLineMode(true); + printer.PrintToString(device_sharding_config, &proto_text); + + return proto_text; + } + + template + OpT GetDefiningOp(const mlir::Value& value) { + mlir::Operation* op = value.getDefiningOp(); + + while (op && !llvm::isa(op)) { + if (llvm::isa(op)) { + op = op->getOperand(0).getDefiningOp(); + } else { + return nullptr; + } + } + + if (op != nullptr) { + return llvm::dyn_cast(op); + } else { + return nullptr; + } + } +}; + +} // namespace + +std::unique_ptr> +CreateSinkVariableAsNamedArrayPass() { + return std::make_unique(); +} + +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/xla_call_host_callback.mlir b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/xla_call_host_callback.mlir new file mode 100644 index 00000000000000..4eff0866ba7a66 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata/xla_call_host_callback.mlir @@ -0,0 +1,23 @@ +module attributes {tf.versions = {bad_consumers = [], min_consumer = 12 : i32, producer = 1758 : i32}} { + + func.func private @callee(%arg0: tensor, %arg1: tensor<*xi32>) { + "tf.XlaHostCompute"(%arg0, %arg1) <{ancestors = [], key = "@test_callee", recv_key = "", send_key = "", shapes = []}> {_xla_original_oc_node_name = "hcb0", _xla_token_input_nodes = ["_xla_token_arg_node"]} : (tensor, tensor<*xi32>) -> () + return + } + + // The mlir module in XlaCallModule is serialized from: + // + // func.func private @_stablehlo_main_0(%arg0: tensor, %arg1: tensor<*xi32>) 
-> () attributes {_from_xla_call_module} { + // stablehlo.custom_call @tf.call_tf_function(%arg0, %arg1) {api_version = 2 : i32, has_side_effect = true, tf.backend_config = {called_func = @callee}} : (tensor, tensor<*xi32>) -> () + // return + // } + // + // func.func @main(%arg0: tensor, %arg1: tensor<*xi32>) -> () { + // "tf.XlaCallModule"(%arg0, %arg1) {Sout = [#tf_type.shape], dim_args_spec = [], _entry_function = @_stablehlo_main_0, _stablehlo_module_attrs = { mhlo.num_partitions = 1 }, module = "", platforms = [], version = 5 : i64} : (tensor, tensor<*xi32>) -> () + // func.return + // } + func.func @main(%arg0: tensor, %arg1: tensor<*xi32>) attributes {tfrt_ifrt_serving.program_id = -2372940092539171444 : i64, __tpu_compile_metadata_text = "args { dtype: DT_INT32 kind: PARAMETER sharding { } } args { dtype: DT_INT32 kind: PARAMETER sharding { } } num_replicas: 1 num_cores_per_replica: 1 use_spmd_for_xla_partitioning: true compile_options { }"} { + "tf.XlaCallModule"(%arg0, %arg1) <{Sout = [], dim_args_spec = [], function_list = [@callee], module = "ML\EFR\0DStableHLO_v0.17.6\00\01\19\05\01\05\09\01\03\0B\03\07\0F\13\17\03M-\0D\01\19\0B\13\0B\0F\13\13\13\13\13\0B\13\13\03\15\0B\0B\0B\0B\13\0B\0F\0B\0B\0B\01\03\0F\03\0B3\07\0B\17\07\02\B1\05\0D\03\03\05\07\05\0F\11\01\05\17\01A\0B\17\01!\07\17\01!Q\17\01!}\03\03\13!\05\11\17\01#\0B\17\01%\0B\03\01\1D\13#\09\1D\15\0D\03#%\1D\17\13\0B\01\0B\05\1D\19\05\03\01\02\04)\03\00\FF\FF\FF\FF\FF\FF\FF\FF\05\1B3\05\11\05\03\07\01\1D\04O\05\01Q\09\03\01\07\04=\03\01\05\03P\0B\03\07\04)\03\05\0B\05\07\0D\0F\0F\00\05E\15\11\05\05\01\03\07\00\17\06\03\01\05\01\00j\03\1B)\1B\0B\03%)\95\15\1F\11\0F\0B\11builtin\00vhlo\00module\00func_v1\00custom_call_v1\00return_v1\00experimental/users/deqiangc/mira/testdata/xla_call_module_serialized.mlir\00mhlo.num_partitions\00tf.backend_config\00\00main\00called_index\00tf.call_tf_function\00\08'\07\05\01\01\0B\19\1D\19\1F\1B\11'\1B)\19+\19\19\19", platforms = [], version = 5 : i64}> : 
(tensor, tensor<*xi32>) -> () + return + } +} diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc index 96794df1b7e34f..3cecc9c90ab4e2 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.cc @@ -59,7 +59,6 @@ limitations under the License. #include "tensorflow/core/protobuf/tpu/topology.pb.h" #include "tensorflow/core/tpu/kernels/tpu_compile_op_support.h" #include "tsl/platform/errors.h" -#include "tsl/platform/protobuf.h" #include "tsl/platform/statusor.h" namespace tensorflow { @@ -68,37 +67,27 @@ namespace { static constexpr absl::string_view kEntryFuncName = "main"; absl::StatusOr GetCompileMetadata( - mlir::func::FuncOp op, absl::Span inputs, + mlir::func::FuncOp op, absl::Span inputs, const xla::ifrt::Client& ifrt_client) { tensorflow::tpu::TPUCompileMetadataProto metadata; - auto metadata_attr = op->getAttrOfType(kMetadataAttrName); auto metadata_text_attr = op->getAttrOfType(kMetadataTextAttrName); - if (metadata_attr && !metadata_attr.getValue().empty()) { - // tpu_compile_metadata takes priority if exists. - VLOG(1) << "Parsing from attribute " << kMetadataAttrName << " : " - << metadata_attr.getValue().str(); - if (!metadata.ParseFromString(metadata_attr.getValue())) { - return absl::InternalError( - absl::StrCat("Failed to parse tpu_compile_metadata attribute:", - metadata_attr.getValue().str())); - } - } else if (metadata_text_attr && !metadata_text_attr.getValue().empty()) { + if (metadata_text_attr && !metadata_text_attr.getValue().empty()) { // Try __tpu_compile_metadata_text attribute. This only for debugging // purpose. 
VLOG(1) << "Parsing from attribute " << kMetadataTextAttrName << metadata_text_attr.getValue().str(); if (!tsl::protobuf::TextFormat::ParseFromString( - metadata_text_attr.getValue(), &metadata)) { + metadata_text_attr.getValue().str(), &metadata)) { return absl::InvalidArgumentError(absl::StrCat( "Attribute ", kMetadataTextAttrName, ":", metadata_text_attr.getValue().str(), " cannot be parsed")); } } else { - return absl::InvalidArgumentError(absl::StrCat( - "Missing ", kMetadataAttrName, " and ", kMetadataTextAttrName)); + return absl::InvalidArgumentError( + absl::StrCat("Missing ", kMetadataTextAttrName)); } VLOG(3) << "TpuCompileMetadata before shape is populated " << metadata; @@ -126,14 +115,14 @@ absl::StatusOr GetCompileMetadata( "Only support PARAMETER, but got ", metadata.args(i).kind())); } - if (metadata.args(i).dtype() != inputs[i].dtype()) { + if (metadata.args(i).dtype() != inputs[i].dtype) { return absl::InternalError(absl::StrCat("Dtype mismatched! Expected ", metadata.args(i).dtype(), " got ", - inputs[i].dtype())); + inputs[i].dtype)); } // Update shape. - *metadata.mutable_args(i)->mutable_shape() = inputs[i].shape().AsProto(); + *metadata.mutable_args(i)->mutable_shape() = inputs[i].shape.AsProto(); } // Create a default device assignment if one is not given by the model. 
@@ -154,7 +143,7 @@ absl::StatusOr GetCompileMetadata( } // namespace absl::StatusOr CompileTfToHlo( - mlir::ModuleOp module, absl::Span inputs, + mlir::ModuleOp module, absl::Span inputs, absl::string_view entry_function_name, const xla::ifrt::Client& ifrt_client, tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn) { if (VLOG_IS_ON(1)) { @@ -192,7 +181,7 @@ absl::StatusOr CompileTfToHlo( std::vector arg_shapes; for (const auto& input : inputs) { - arg_shapes.push_back(input.shape()); + arg_shapes.push_back(input.shape); } bool use_tuple_args = false; @@ -222,6 +211,7 @@ absl::StatusOr CompileTfToHlo( Tf2HloResult result; result.mlir_hlo_module = xla::llvm_ir::CreateMlirModuleOp(module->getLoc()); result.compile_metadata = std::move(compile_metadata); + result.host_compute_metadata = compilation_result.host_compute_metadata; TF_RETURN_IF_ERROR(xla::ConvertHloToMlirHlo( *result.mlir_hlo_module, &compilation_result.computation->proto())); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h index f170a5fc5d9265..74fa271401f547 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo.h @@ -24,20 +24,28 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/python/ifrt/client.h" #include "tensorflow/core/framework/tensor.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/framework/types.pb.h" #include "tensorflow/core/protobuf/tpu/compile_metadata.pb.h" namespace tensorflow { namespace ifrt_serving { +struct DtypeAndShape { + tensorflow::DataType dtype; + tensorflow::TensorShape shape; +}; + struct Tf2HloResult { mlir::OwningOpRef mlir_hlo_module; tensorflow::tpu::TPUCompileMetadataProto compile_metadata; + tf2xla::HostComputeMetadata host_compute_metadata; }; // A class that convert tf module to hlo // TODO(b/304839793): provide wrap persistent compilation cache. absl::StatusOr CompileTfToHlo( - mlir::ModuleOp module, absl::Span inputs, + mlir::ModuleOp module, absl::Span inputs, absl::string_view entry_function_name, const xla::ifrt::Client& ifrt_client, tensorflow::XlaHelpers::ShapeRepresentationFn shape_representation_fn); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc index 89f353c68eafdd..7ee1c450426b20 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf2hlo_test.cc @@ -35,8 +35,6 @@ limitations under the License. 
#include "tensorflow/compiler/tf2xla/xla_helpers.h" #include "xla/python/ifrt/client.h" #include "xla/python/ifrt/test_util.h" -#include "tensorflow/core/framework/tensor.h" -#include "tensorflow/core/framework/tensor_shape.h" #include "tensorflow/core/platform/resource_loader.h" #include "tensorflow/core/platform/test.h" #include "tsl/lib/core/status_test_util.h" @@ -124,13 +122,12 @@ TEST(Tf2HloTest, Tuple) { TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, xla::ifrt::test_util::GetClient()); - std::vector tensors; - tensorflow::Tensor x(DT_FLOAT, tensorflow::TensorShape({1, 3})); - tensorflow::Tensor y(DT_FLOAT, tensorflow::TensorShape({3, 1})); - tensors.push_back(x); - tensors.push_back(y); - auto result = CompileTfToHlo(mlir_module.get(), tensors, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + std::vector dtype_and_shapes; + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {1, 3}}); + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {3, 1}}); + auto result = + CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, + tensorflow::IdentityShapeRepresentationFn()); TF_ASSERT_OK(result.status()); } @@ -158,12 +155,11 @@ TEST(Tf2HloTest, Spmd) { TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, xla::ifrt::test_util::GetClient()); - std::vector tensors; - tensorflow::Tensor x(DT_FLOAT, tensorflow::TensorShape({4, 64})); - tensors.push_back(x); - - auto result = CompileTfToHlo(mlir_module.get(), tensors, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + std::vector dtype_and_shapes; + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {4, 64}}); + auto result = + CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, + tensorflow::IdentityShapeRepresentationFn()); LOG(INFO) << result->compile_metadata; TF_ASSERT_OK(result.status()); @@ -227,16 +223,13 @@ TEST(Tf2HloTest, UsingDefaultDeviceAssignment) { TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, xla::ifrt::test_util::GetClient()); - std::vector 
tensors; - tensorflow::Tensor x(DT_FLOAT, tensorflow::TensorShape({4, 64})); - tensorflow::Tensor y(DT_FLOAT, tensorflow::TensorShape({64, 10})); - tensorflow::Tensor z(DT_FLOAT, tensorflow::TensorShape({1, 4})); - tensors.push_back(x); - tensors.push_back(y); - tensors.push_back(z); - - auto result = CompileTfToHlo(mlir_module.get(), tensors, "main", *client, - tensorflow::IdentityShapeRepresentationFn()); + std::vector dtype_and_shapes; + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {4, 64}}); + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {64, 10}}); + dtype_and_shapes.push_back(DtypeAndShape{DT_FLOAT, {1, 4}}); + auto result = + CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, + tensorflow::IdentityShapeRepresentationFn()); LOG(INFO) << result->compile_metadata; TF_ASSERT_OK(result.status()); @@ -302,6 +295,47 @@ TEST(Tf2HloTest, UsingDefaultDeviceAssignment) { EXPECT_THAT(result->compile_metadata, EqualsProto(expected_compile_metadata)); } +// Multiple input and multiple out. 
+TEST(Tf2HloTest, XlaCallHostCallback) { + // Create test input module + constexpr absl::string_view kDataDirectory = + "tensorflow/compiler/mlir/tfrt/transforms/ifrt/testdata"; + std::string mlir_module_path = tensorflow::GetDataDependencyFilepath( + absl::StrCat(kDataDirectory, "/xla_call_host_callback.mlir")); + + mlir::DialectRegistry registry; + mlir::registerAllDialects(registry); + mlir::RegisterAllTensorFlowDialects(registry); + + mlir::MLIRContext context(registry); + + mlir::OwningOpRef mlir_module = + mlir::parseSourceFile(mlir_module_path, + mlir::ParserConfig(&context)); + + ASSERT_TRUE(mlir_module); + ASSERT_TRUE(mlir_module.get() != nullptr); + + TF_ASSERT_OK_AND_ASSIGN(std::shared_ptr client, + xla::ifrt::test_util::GetClient()); + + std::vector dtype_and_shapes; + dtype_and_shapes.push_back(DtypeAndShape{DT_INT32, {1}}); + dtype_and_shapes.push_back(DtypeAndShape{DT_INT32, {1}}); + + auto result = + CompileTfToHlo(mlir_module.get(), dtype_and_shapes, "main", *client, + tensorflow::IdentityShapeRepresentationFn()); + + TF_ASSERT_OK(result.status()); + + ASSERT_EQ((*result).host_compute_metadata.device_to_host().size(), 1); + ASSERT_EQ( + (*result).host_compute_metadata.device_to_host().begin()->metadata_size(), + 2); + ASSERT_EQ((*result).host_compute_metadata.host_to_device().size(), 0); +} + } // namespace } // namespace ifrt_serving } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc index 47fe04925d1881..53bd55cc0d2799 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.cc @@ -27,10 +27,12 @@ limitations under the License. 
#include "mlir/Pass/PassManager.h" // from @llvm-project #include "mlir/Pass/PassRegistry.h" // from @llvm-project #include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "mlir/Transforms/Passes.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/transforms/passes.h" #include "tensorflow/compiler/mlir/tensorflow/utils/data_dumper_logger_config.h" #include "tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.h" #include "tensorflow/compiler/mlir/tensorflow/utils/error_util.h" +#include "tensorflow/compiler/mlir/tfrt/transforms/passes.h" #include "tensorflow/core/util/debug_data_dumper.h" namespace tensorflow { @@ -69,6 +71,20 @@ void AddClusterToIfrtRuntimeOpsPassPipeline(OpPassManager& pm, mlir::TF::CreateCanonicalizeCompileAndReplicateAttributesPass()); pm.addPass(CreateRewriteClusterToIfrtCallPass()); + + // Sink VarHandle with ReadVariableOp: subsequent SinkVariableAsNamedArrayPass + // rely on the co-existence of VarHandle and ReadVariable in the same + // function. + // First, we inline all the function calls. This will sink VarHandle + // with ReadVariable in most cases. Then SinkInvariantOpsPass will sink + // VarHandle to a few special Ops that inliner does not handle. + // TODO(b/319045348): the bridge before this pipeline already does some + // inlining. Consider removing this inliner. + pm.addPass(mlir::createInlinerPass()); + pm.addPass(::tensorflow::CreateSinkInInvariantOpsPass()); + + // Sink variable tensor as named array in IFRT. 
+ pm.addPass(CreateSinkVariableAsNamedArrayPass()); } } // namespace @@ -111,9 +127,7 @@ absl::Status RunClusterToIfrtRuntimeOpsPassPipeline( } // Register all IfrtPass -void RegisterTfIfrtPasses() { - mlir::registerPass([]() { return CreateRewriteClusterToIfrtCallPass(); }); -} +void RegisterTfIfrtPasses() { registerTfrtIfrtServingPasses(); } } // namespace ifrt_serving } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h index b04c61eb5c76d8..084b170fa5b9a4 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_ifrt_passes.h @@ -20,6 +20,7 @@ limitations under the License. #include "absl/status/status.h" #include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/Pass/Pass.h" // from @llvm-project @@ -30,6 +31,19 @@ namespace ifrt_serving { std::unique_ptr> CreateRewriteClusterToIfrtCallPass(); +// Creates a pass that sinks variable tensor argument to `tf.IfrtCall` as named +// arrays and lowers `tf.ReadVariableOp` to `tf.IfrtLoadVariableOp`. +std::unique_ptr> +CreateSinkVariableAsNamedArrayPass(); + +// Creates a pass that splits `tf.RestoreV2` ops. +std::unique_ptr> +CreateTfRestoreSplittingPass(); + +// Creates a pass that merges `tf.RestoreV2` ops. +std::unique_ptr> +CreateTfRestoreMergingPass(); + #define GEN_PASS_REGISTRATION #include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc" // IWYU pragma: keep diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc new file mode 100644 index 00000000000000..5220824d3f716a --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_merging.cc @@ -0,0 +1,164 @@ +/* Copyright 2024 The TensorFlow Authors. 
All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +==============================================================================*/ + +#include + +#include +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Location.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { + +#define GEN_PASS_DEF_TFRESTOREMERGINGPASS +#define GEN_PASS_DECL_TFRESTOREMERGINGPASS +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc" // IWYU pragma: keep + +class TfRestoreMergingPass + : public impl::TfRestoreMergingPassBase { + public: + void runOnOperation() override { + mlir::func::FuncOp func = 
getOperation(); + for (mlir::Block& block : func) { + // Group `tf.RestoreV2` ops by prefixes and merge each group. + llvm::SmallDenseMap> + restore_groups; + for (auto restore : block.getOps()) { + restore_groups[restore.getPrefix()].push_back(restore); + } + for (const auto& restores : llvm::make_second_range(restore_groups)) { + if (mlir::failed(MergeRestores(restores))) { + return signalPassFailure(); + } + } + } + } + + private: + mlir::DenseStringElementsAttr GetStringTensorAttr( + llvm::ArrayRef values) { + const int size = values.size(); + const auto type = mlir::RankedTensorType::get( + {size}, mlir::TF::StringType::get(&getContext())); + return mlir::DenseStringElementsAttr::get(type, values); + } + + // Merges `tf.RestoreV2` ops with the same prefix. Ignores restore ops with + // non-constant `tensor_names` and/or `shape_and_slices`. + mlir::LogicalResult MergeRestores( + llvm::ArrayRef restores) { + if (restores.size() <= 1) { + return mlir::success(); + } + + // All restore ops must have the same prefix. 
+ const mlir::Value prefix = + mlir::TF::RestoreV2Op(restores.front()).getPrefix(); + + std::vector restores_to_merge; + std::vector values_to_replace; + std::vector merged_tensor_names; + std::vector merged_shape_and_slices; + + std::vector restore_locs; + std::vector tensor_names_locs; + std::vector shape_and_slices_locs; + + for (mlir::TF::RestoreV2Op restore : restores) { + mlir::DenseStringElementsAttr tensor_names; + mlir::DenseStringElementsAttr shape_and_slices; + if (!mlir::matchPattern(restore, + mlir::m_Op( + mlir::matchers::m_Val(prefix), + mlir::m_Constant(&tensor_names), + mlir::m_Constant(&shape_and_slices)))) { + continue; + } + if (tensor_names.size() != restore.getNumResults() || + shape_and_slices.size() != restore.getNumResults()) { + return restore.emitOpError() + << "returns an inconsistent number of results"; + } + + restores_to_merge.push_back(restore); + llvm::append_range(values_to_replace, restore.getTensors()); + llvm::append_range(merged_tensor_names, + tensor_names.getValues()); + llvm::append_range(merged_shape_and_slices, + shape_and_slices.getValues()); + + restore_locs.push_back(restore.getLoc()); + tensor_names_locs.push_back(restore.getTensorNames().getLoc()); + shape_and_slices_locs.push_back(restore.getShapeAndSlices().getLoc()); + } + if (restores_to_merge.size() <= 1) { + return mlir::success(); + } + + // Insert the merged restore op right before the first restore op to be + // merged in order to keep the dominance property. 
+ mlir::OpBuilder builder(restores_to_merge.front()); + + auto new_tensor_names = builder.create( + builder.getFusedLoc(tensor_names_locs), + GetStringTensorAttr(merged_tensor_names)); + auto new_shape_and_slices = builder.create( + builder.getFusedLoc(shape_and_slices_locs), + GetStringTensorAttr(merged_shape_and_slices)); + + auto new_restore = builder.create( + builder.getFusedLoc(restore_locs), + mlir::TypeRange(mlir::ValueRange(values_to_replace)), prefix, + new_tensor_names, new_shape_and_slices); + for (auto [old_value, new_value] : + llvm::zip(values_to_replace, new_restore.getTensors())) { + old_value.replaceAllUsesWith(new_value); + } + + for (mlir::TF::RestoreV2Op restore : restores_to_merge) { + restore.erase(); + } + return mlir::success(); + } +}; + +} // namespace + +std::unique_ptr> +CreateTfRestoreMergingPass() { + return std::make_unique(); +} + +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc new file mode 100644 index 00000000000000..130ca0a2e90b74 --- /dev/null +++ b/tensorflow/compiler/mlir/tfrt/transforms/ifrt/tf_restore_splitting.cc @@ -0,0 +1,122 @@ +/* Copyright 2024 The TensorFlow Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+==============================================================================*/ + +#include + +#include + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" +#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project +#include "mlir/IR/Builders.h" // from @llvm-project +#include "mlir/IR/BuiltinAttributes.h" // from @llvm-project +#include "mlir/IR/BuiltinTypes.h" // from @llvm-project +#include "mlir/IR/Diagnostics.h" // from @llvm-project +#include "mlir/IR/Matchers.h" // from @llvm-project +#include "mlir/IR/Operation.h" // from @llvm-project +#include "mlir/IR/OperationSupport.h" // from @llvm-project +#include "mlir/IR/Value.h" // from @llvm-project +#include "mlir/IR/Visitors.h" // from @llvm-project +#include "mlir/Pass/Pass.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tf_types.h" + +namespace tensorflow { +namespace ifrt_serving { +namespace { + +#define GEN_PASS_DEF_TFRESTORESPLITTINGPASS +#define GEN_PASS_DECL_TFRESTORESPLITTINGPASS +#include "tensorflow/compiler/mlir/tfrt/transforms/ifrt/passes.h.inc" // IWYU pragma: keep + +class TfRestoreSplittingPass + : public impl::TfRestoreSplittingPassBase { + public: + void runOnOperation() override { + mlir::func::FuncOp func = getOperation(); + const mlir::WalkResult result = + func.walk([&](mlir::TF::RestoreV2Op restore) { + if (mlir::failed(SplitRestore(restore))) { + return mlir::WalkResult::interrupt(); + } + return mlir::WalkResult::advance(); + }); + if (result.wasInterrupted()) { + return signalPassFailure(); + } + } + + private: + mlir::DenseStringElementsAttr GetStringTensorAttr( + llvm::ArrayRef values) { + const int size = values.size(); + const auto type = mlir::RankedTensorType::get( + {size}, mlir::TF::StringType::get(&getContext())); + return mlir::DenseStringElementsAttr::get(type, values); + 
} + + // Splits the `tf.RestoreV2` op into per-variable restore ops if its + // `tensor_name` and `shape_and_slices` are constant. + mlir::LogicalResult SplitRestore(mlir::TF::RestoreV2Op restore) { + mlir::DenseStringElementsAttr tensor_names; + mlir::DenseStringElementsAttr shape_and_slices; + if (!mlir::matchPattern(restore, + mlir::m_Op( + /*prefix=*/mlir::matchers::m_Any(), + mlir::m_Constant(&tensor_names), + mlir::m_Constant(&shape_and_slices)))) { + return mlir::success(); + } + if (tensor_names.size() != restore.getNumResults() || + shape_and_slices.size() != restore.getNumResults()) { + return restore.emitOpError() + << "returns an inconsistent number of results"; + } + + mlir::OpBuilder builder(restore); + for (auto [tensor_name, shape_and_slice, result] : + llvm::zip(tensor_names.getValues(), + shape_and_slices.getValues(), + restore.getTensors())) { + auto new_tensor_names = + builder.create(restore.getTensorNames().getLoc(), + GetStringTensorAttr({tensor_name})); + + auto new_shape_and_slices = builder.create( + restore.getShapeAndSlices().getLoc(), + GetStringTensorAttr({shape_and_slice})); + + auto new_restore = builder.create( + restore.getLoc(), mlir::TypeRange({result.getType()}), + restore.getPrefix(), new_tensor_names, new_shape_and_slices); + result.replaceAllUsesWith(new_restore.getTensors()[0]); + } + + restore.erase(); + return mlir::success(); + } +}; + +} // namespace + +std::unique_ptr> +CreateTfRestoreSplittingPass() { + return std::make_unique(); +} + +} // namespace ifrt_serving +} // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD index eb61561911f2b4..bec0d45d6b1525 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/BUILD @@ -51,6 +51,7 @@ cc_library( "//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_a_m_inc_gen", 
"//tensorflow/compiler/mlir/tensorflow:tensorflow_ops_n_z_inc_gen", "//tensorflow/compiler/mlir/tensorflow:tensorflow_tfrt_ops_inc_gen", + "@llvm-project//llvm:Support", "@llvm-project//mlir:IR", ], ) @@ -65,6 +66,7 @@ cc_library( ":util", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:export_tf_dialect_op", + "//tensorflow/compiler/mlir/tensorflow:tensorflow_tfrt_ops_inc_gen", "//tensorflow/compiler/mlir/tfrt:constants", "//tensorflow/compiler/mlir/tfrt:tfrt_pipeline_options", "//tensorflow/compiler/mlir/tfrt:transform_utils", diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc index be81e54ece871f..a9dca8adca598c 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/tf_to_mlrt.cc @@ -37,6 +37,7 @@ limitations under the License. #include "mlir/Transforms/DialectConversion.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" +#include "tensorflow/compiler/mlir/tensorflow/ir/tfrt_ops.h.inc" #include "tensorflow/compiler/mlir/tensorflow/translate/export_tf_dialect_op.h" #include "tensorflow/compiler/mlir/tfrt/constants.h" #include "tensorflow/compiler/mlir/tfrt/ir/mlrt/mlrt_dialect.h" @@ -322,6 +323,26 @@ class GetResourceOpConversion final } }; +// Convert tf.IfrtLoadVariableOp to tf_mlrt.IfrtLoadVariableOp +class IfrtLoadVariableOpConversion + : public mlir::OpConversionPattern { + public: + using OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult matchAndRewrite( + mlir::TF::IfrtLoadVariableOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { + llvm::SmallVector result_types( + op->getNumResults(), rewriter.getType()); + auto new_op = rewriter.create( + op.getLoc(), result_types, adaptor.getOperands()[0], + 
op.getDeviceShardingConfigProtoTextAttr(), op.getNameAttr()); + rewriter.replaceOp(op, new_op); + + return mlir::success(); + } +}; + std::optional DecodeLongName(mlir::Location loc) { if (auto name_loc = loc.dyn_cast()) { return name_loc.getName().str(); @@ -1167,8 +1188,8 @@ class TfToMlrtConversionPass // Order the list of added ops alphabetically. patterns.add(&context, &type_converter_, &symbol_table); patterns.add(&context); + SetResourceOpConversion, IfrtLoadVariableOpConversion, + TFAwaitOpConversion, TFPromiseOpConversion>(&context); patterns.add(type_converter_, &context); diff --git a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc index fb110fb01f2ef1..69c8b08dcbc0b1 100644 --- a/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc +++ b/tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.cc @@ -14,6 +14,7 @@ limitations under the License. ==============================================================================*/ #include "tensorflow/compiler/mlir/tfrt/transforms/mlrt/util.h" +#include "llvm/Support/Casting.h" #include "mlir/IR/Operation.h" // from @llvm-project #include "tensorflow/compiler/mlir/tensorflow/ir/tf_dialect.h" #include "tensorflow/compiler/mlir/tensorflow/ir/tf_ops.h" @@ -29,13 +30,15 @@ bool UseFallback(mlir::Operation *op) { // TODO(b/173017701): have a centralized place to hold the information // whether a TF op should be lowered to FallbackExecute op. + // TODO(b/319045348): Define trait to reflect that IfrtLoadVariableOp has no + // TF kernels so that we don't need to check every op here. 
// LINT.IfChange(fallback_allow_list) - return !llvm::isa(op); + return !llvm::isa< + mlir::TF::_TfrtSetResourceOp, mlir::TF::_TfrtGetResourceOp, + mlir::TF::BatchFunctionOp, mlir::TF::CaseOp, mlir::TF::IfrtLoadVariableOp, + mlir::TF::StatefulPartitionedCallOp, mlir::TF::PartitionedCallOp, + mlir::TF::LegacyCallOp, mlir::TF::IfOp, mlir::TF::WhileOp, + mlir::TF::TPUCompileMlirAndExecuteOp>(op); // LINT.ThenChange(tf_to_mlrt.cc:fallback_allow_list) } diff --git a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc index 974bc1a56c938f..66aee10db7e050 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/import_model.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/import_model.cc @@ -238,7 +238,7 @@ Status ConvertTfMlirToRuntimeExecutable( TF_RETURN_IF_ERROR( tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tf2xla::v2::DeviceType::XLA_TPU_JIT, + module, /*is_supported_by_replicated_brige*/ true, /*is_in_fallback_enabled_mode=*/VLOG_IS_ON(1))); TF_RETURN_IF_ERROR( @@ -257,7 +257,7 @@ Status ConvertTfMlirToRuntimeExecutable( } else if (options.device_target == TfrtDeviceInfraTarget::kGpu) { TF_RETURN_IF_ERROR( tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tf2xla::v2::DeviceType::XLA_GPU_JIT, + module, /*is_supported_by_replicated_brige*/ false, /*is_in_fallback_enabled_mode=*/false)); TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD index b065f29da4ddfc..bc887cdfc966f9 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/BUILD @@ -25,6 +25,7 @@ cc_library( "@com_google_absl//absl/log", "@com_google_absl//absl/log:check", "@com_google_absl//absl/status:statusor", + "@com_google_absl//absl/strings", "@llvm-project//llvm:Support", "@llvm-project//mlir:FuncDialect", "@llvm-project//mlir:IR", diff --git 
a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc index 1953ddd3d93997..98cb26acdba8fa 100644 --- a/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc +++ b/tensorflow/compiler/mlir/tfrt/translate/mlrt/mlir_to_bytecode.cc @@ -24,6 +24,7 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/log/check.h" #include "absl/log/log.h" +#include "absl/strings/str_cat.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/TypeSwitch.h" #include "mlir/IR/Attributes.h" // from @llvm-project diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD index 44300ff2209f0d..6cfe2be883e6ca 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/BUILD +++ b/tensorflow/compiler/mlir/tools/kernel_gen/BUILD @@ -48,6 +48,7 @@ cc_library( "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:gpu_passes", # fixdeps: keep "//tensorflow/compiler/mlir/tools/kernel_gen/transforms:passes", "//tensorflow/core:lib", + "@com_google_absl//absl/status", "@llvm-project//llvm:Support", "@llvm-project//mlir:AffineToStandard", "@llvm-project//mlir:ArithDialect", @@ -56,8 +57,8 @@ cc_library( "@llvm-project//mlir:BuiltinToLLVMIRTranslation", "@llvm-project//mlir:ComplexDialect", "@llvm-project//mlir:ComplexToStandard", + "@llvm-project//mlir:ControlFlowDialect", "@llvm-project//mlir:FuncDialect", - "@llvm-project//mlir:FuncToLLVM", "@llvm-project//mlir:GPUDialect", "@llvm-project//mlir:GPUToGPURuntimeTransforms", "@llvm-project//mlir:GPUToLLVMIRTranslation", @@ -67,6 +68,7 @@ cc_library( "@llvm-project//mlir:LLVMIRTransforms", "@llvm-project//mlir:LLVMToLLVMIRTranslation", "@llvm-project//mlir:LinalgTransforms", + "@llvm-project//mlir:MemRefDialect", "@llvm-project//mlir:MemRefTransforms", "@llvm-project//mlir:NVVMToLLVMIRTranslation", "@llvm-project//mlir:Parser", @@ -76,9 +78,10 @@ cc_library( 
"@llvm-project//mlir:SCFToControlFlow", "@llvm-project//mlir:SCFToGPU", "@llvm-project//mlir:SCFTransforms", + "@llvm-project//mlir:ShapeDialect", "@llvm-project//mlir:ShapeToStandard", + "@llvm-project//mlir:Support", "@llvm-project//mlir:Transforms", - "@llvm-project//mlir:VectorToLLVM", "@local_xla//xla/mlir_hlo", "@local_xla//xla/mlir_hlo:all_passes", # fixdeps: keep "@local_xla//xla/mlir_hlo:mhlo_passes", @@ -162,11 +165,21 @@ cc_library( "//tensorflow/compiler/mlir/tools/kernel_gen/ir:tf_framework_ops", "//tensorflow/core:framework", "//tensorflow/core:lib", + "//tensorflow/core/platform:refcount", + "@com_google_absl//absl/log", + "@com_google_absl//absl/log:check", + "@com_google_absl//absl/strings", + "@com_google_absl//absl/strings:string_view", + "@llvm-project//llvm:JITLink", + "@llvm-project//llvm:OrcShared", "@llvm-project//llvm:Support", "@llvm-project//mlir:ExecutionEngine", "@llvm-project//mlir:ExecutionEngineUtils", + "@llvm-project//mlir:IR", "@llvm-project//mlir:MemRefTransforms", "@llvm-project//mlir:Parser", + "@llvm-project//mlir:Support", + "@llvm-project//mlir:mlir_c_runner_utils", "@llvm-project//mlir:mlir_runner_utils", "@local_xla//xla/stream_executor", ], @@ -178,8 +191,13 @@ cc_library( hdrs = ["tf_jit_cache.h"], deps = [ "//tensorflow/core:framework", + "//tensorflow/core:framework_lite", + "//tensorflow/core/framework:resource_base", + "//tensorflow/core/platform:status", "@com_google_absl//absl/container:flat_hash_map", + "@llvm-project//llvm:Support", "@llvm-project//mlir:ExecutionEngine", + "@local_tsl//tsl/platform:thread_annotations", ], ) @@ -197,6 +215,7 @@ cc_library( ]), deps = [ "//tensorflow/core:framework", + "//tensorflow/core/framework:resource_base", "//tensorflow/core/platform:logging", "//tensorflow/core/platform:mutex", "//tensorflow/core/platform:refcount", @@ -206,6 +225,7 @@ cc_library( "@com_google_absl//absl/strings", "@llvm-project//mlir:mlir_runner_utils", "@local_tsl//tsl/platform:hash", + 
"@local_tsl//tsl/platform:thread_annotations", "@local_xla//xla/stream_executor", ] + if_cuda_is_configured([ "@local_config_cuda//cuda:cuda_headers", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc index b5537741529d06..4c05366e42af26 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/hlo_to_kernel.cc @@ -119,8 +119,8 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file, llvm::StringRef host_triple, llvm::ArrayRef architectures, llvm::ArrayRef tile_sizes, - llvm::ArrayRef unroll_factors, int64_t max_supported_rank, - bool print_ptx, bool print_llvmir, bool enable_ftz, bool index_64bit, + llvm::ArrayRef unroll_factors, bool print_ptx, + bool print_llvmir, bool enable_ftz, bool index_64bit, bool jit_compile, bool jit_i64_indexed_for_large_tensors) { // Read TF code. std::string hlo_code; @@ -138,9 +138,9 @@ Status Run(llvm::StringRef input_file, llvm::StringRef output_file, TF_ASSIGN_OR_RETURN( mlir::OwningOpRef module, GenerateKernelForHloCode(context, hlo_code, architectures, tile_sizes, - unroll_factors, max_supported_rank, print_ptx, - print_llvmir, enable_ftz, index_64bit, - jit_compile, jit_i64_indexed_for_large_tensors, + unroll_factors, print_ptx, print_llvmir, + enable_ftz, index_64bit, jit_compile, + jit_i64_indexed_for_large_tensors, /*apply_cl_options=*/true)); // Get binary. @@ -186,11 +186,6 @@ int main(int argc, char** argv) { llvm::cl::list architectures( "arch", llvm::cl::desc("target architectures (e.g. 
sm_70 or compute_75)"), llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); - llvm::cl::opt max_supported_rank( - "max-supported-rank", - llvm::cl::desc("maximum supported rank to be guaranteed by rank " - "specialization lowering"), - llvm::cl::init(5)); llvm::cl::list tile_sizes( "tile_sizes", llvm::cl::desc("tile sizes to use"), llvm::cl::ZeroOrMore, llvm::cl::CommaSeparated); @@ -222,8 +217,8 @@ int main(int argc, char** argv) { auto status = tensorflow::kernel_gen::Run( input_file, output_file, host_triple, architectures, tile_sizes, - unroll_factors, max_supported_rank, print_ptx, print_llvmir, enable_ftz, - index_64bit, jit_compile, jit_i64_indexed_for_large_tensors); + unroll_factors, print_ptx, print_llvmir, enable_ftz, index_64bit, + jit_compile, jit_i64_indexed_for_large_tensors); if (!status.ok()) { LOG(ERROR) << status; return 1; diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td index 8856ae73b0b4bf..d8e7617cc352ba 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.td @@ -284,7 +284,6 @@ def TFFramework_JITCompileFromStrOp : TFFramework_Op<"jit_compile_from_str", StrAttr:$code, I64ArrayAttr:$tileSizes, I64ArrayAttr:$unrollFactors, - I64Attr:$maxSupportedRank, BoolAttr:$enableFtz, BoolAttr:$index64Bit, BoolAttr:$cpuCodegen diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc index 8bf068241f83fa..c8969f429e1805 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.cc @@ -23,31 +23,36 @@ limitations under the License. 
#include +#include "absl/status/status.h" +#include "llvm/ADT/ArrayRef.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" // from @llvm-project #include "mlir/Conversion/ComplexToStandard/ComplexToStandard.h" // from @llvm-project -#include "mlir/Conversion/FuncToLLVM/ConvertFuncToLLVMPass.h" // from @llvm-project #include "mlir/Conversion/GPUCommon/GPUCommonPass.h" // from @llvm-project #include "mlir/Conversion/GPUToNVVM/GPUToNVVMPass.h" // from @llvm-project #include "mlir/Conversion/ReconcileUnrealizedCasts/ReconcileUnrealizedCasts.h" // from @llvm-project #include "mlir/Conversion/SCFToControlFlow/SCFToControlFlow.h" // from @llvm-project #include "mlir/Conversion/SCFToGPU/SCFToGPUPass.h" // from @llvm-project #include "mlir/Conversion/ShapeToStandard/ShapeToStandard.h" // from @llvm-project -#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" // from @llvm-project #include "mlir/Dialect/Arith/IR/Arith.h" // from @llvm-project #include "mlir/Dialect/Arith/Transforms/Passes.h" // from @llvm-project #include "mlir/Dialect/Bufferization/Transforms/Passes.h" // from @llvm-project #include "mlir/Dialect/Complex/IR/Complex.h" // from @llvm-project +#include "mlir/Dialect/ControlFlow/IR/ControlFlow.h" // from @llvm-project #include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/Dialect/GPU/IR/GPUDialect.h" // from @llvm-project #include "mlir/Dialect/GPU/Transforms/Passes.h" // from @llvm-project #include "mlir/Dialect/LLVMIR/Transforms/OptimizeForNVVM.h" // from @llvm-project #include "mlir/Dialect/Linalg/Passes.h" // from @llvm-project -#include "mlir/Dialect/Linalg/Transforms/Transforms.h" // from @llvm-project +#include "mlir/Dialect/MemRef/IR/MemRef.h" // from @llvm-project #include "mlir/Dialect/MemRef/Transforms/Passes.h" // from @llvm-project #include "mlir/Dialect/SCF/Transforms/Passes.h" // from @llvm-project +#include "mlir/Dialect/Shape/IR/Shape.h" // from @llvm-project +#include 
"mlir/IR/BuiltinTypeInterfaces.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project +#include "mlir/Interfaces/DataLayoutInterfaces.h" // from @llvm-project #include "mlir/Parser/Parser.h" // from @llvm-project -#include "mlir/Pass/Pass.h" // from @llvm-project #include "mlir/Pass/PassManager.h" // from @llvm-project +#include "mlir/Support/LogicalResult.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/Builtin/BuiltinToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h" // from @llvm-project #include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h" // from @llvm-project @@ -64,7 +69,10 @@ limitations under the License. #include "xla/mlir_hlo/transforms/gpu_passes.h" #include "xla/mlir_hlo/transforms/passes.h" #include "tensorflow/core/platform/errors.h" +#include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" +#include "tsl/platform/errors.h" +#include "tsl/platform/statusor.h" namespace tensorflow { namespace kernel_gen { @@ -113,8 +121,7 @@ bool IsSmallAlloc(Value alloc) { Status LowerHloToJITInvocation(mlir::ModuleOp module, llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool enable_ftz, - bool index_64bit, + bool enable_ftz, bool index_64bit, bool jit_i64_indexed_for_large_tensors, bool apply_cl_options) { mlir::PassManager pm(module.getContext()); @@ -122,8 +129,7 @@ Status LowerHloToJITInvocation(mlir::ModuleOp module, pm.addNestedPass( mlir::kernel_gen::transforms::CreateFuncToJITInvocationPass( - tile_sizes, unroll_factors, max_supported_rank, enable_ftz, - index_64bit, + tile_sizes, unroll_factors, enable_ftz, index_64bit, /*cpu_codegen=*/false, jit_i64_indexed_for_large_tensors)); pm.addPass(mlir::kernel_gen::tf_framework::CreateEmbedTFFrameworkPass()); pm.addNestedPass( @@ -143,8 +149,7 @@ Status LowerHloToJITInvocation(mlir::ModuleOp module, Status 
LowerHlotoLoops(mlir::ModuleOp module, llvm::ArrayRef tile_sizes, - llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool enable_ftz, + llvm::ArrayRef unroll_factors, bool enable_ftz, bool index_64bit, bool jit_i64_indexed_for_large_tensors, bool apply_cl_options) { mlir::PassManager pm(module.getContext()); @@ -152,14 +157,11 @@ Status LowerHlotoLoops(mlir::ModuleOp module, if (jit_i64_indexed_for_large_tensors) { pm.addNestedPass( mlir::kernel_gen::transforms::CreateFuncToJITInvocationPass( - tile_sizes, unroll_factors, max_supported_rank, enable_ftz, - index_64bit, + tile_sizes, unroll_factors, enable_ftz, index_64bit, /*cpu_codegen=*/false, /*jit_i64_indexed_for_large_tensors=*/true)); } - pm.addNestedPass(mlir::mhlo::createRankSpecializationClusterPass()); - pm.addNestedPass( - mlir::mhlo::createRankSpecializationToSCFPass(max_supported_rank)); + pm.addNestedPass(mlir::mhlo::createChloLegalizeToHloPass()); pm.addNestedPass(mlir::createCanonicalizerPass()); @@ -409,6 +411,8 @@ StatusOr> SetupContextAndParseModule( mlir::MLIRContext& context, llvm::StringRef tf_code) { mlir::DialectRegistry registry; registry.insert(); + registry.insert(); registry.insert(); mlir::registerBuiltinDialectTranslation(registry); mlir::registerGPUDialectTranslation(registry); @@ -418,9 +422,10 @@ StatusOr> SetupContextAndParseModule( context.appendDialectRegistry(registry); mlir::OwningOpRef module = mlir::parseSourceString(tf_code, &context); - if (!module) + if (!module) { return tensorflow::Status(absl::StatusCode::kInvalidArgument, "invalid kernel IR"); + } return module; } @@ -428,9 +433,9 @@ StatusOr> GenerateKernelForHloCode( mlir::MLIRContext& context, llvm::StringRef tf_code, llvm::ArrayRef architectures, llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool print_ptx, bool print_llvmir, - bool enable_ftz, bool index_64bit, bool jit_compile, - bool jit_i64_indexed_for_large_tensors, bool apply_cl_options) { + bool print_ptx, 
bool print_llvmir, bool enable_ftz, bool index_64bit, + bool jit_compile, bool jit_i64_indexed_for_large_tensors, + bool apply_cl_options) { if (jit_compile && jit_i64_indexed_for_large_tensors) { return tensorflow::Status( absl::StatusCode::kInvalidArgument, @@ -446,14 +451,12 @@ StatusOr> GenerateKernelForHloCode( assert(!jit_i64_indexed_for_large_tensors && "expect to have reported an error earlier"); TF_RETURN_IF_ERROR(LowerHloToJITInvocation( - module.get(), tile_sizes, unroll_factors, max_supported_rank, - enable_ftz, index_64bit, + module.get(), tile_sizes, unroll_factors, enable_ftz, index_64bit, /*jit_i64_indexed_for_large_tensors=*/false, apply_cl_options)); } else { - TF_RETURN_IF_ERROR( - LowerHlotoLoops(module.get(), tile_sizes, unroll_factors, - max_supported_rank, enable_ftz, index_64bit, - jit_i64_indexed_for_large_tensors, apply_cl_options)); + TF_RETURN_IF_ERROR(LowerHlotoLoops( + module.get(), tile_sizes, unroll_factors, enable_ftz, index_64bit, + jit_i64_indexed_for_large_tensors, apply_cl_options)); TF_RETURN_IF_ERROR( LowerLoopsToGPU(module.get(), index_64bit, apply_cl_options)); TF_RETURN_IF_ERROR( diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h index ac8666874224aa..f92ff42405db38 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h @@ -29,6 +29,7 @@ limitations under the License. 
#include "mlir/Dialect/Func/IR/FuncOps.h" // from @llvm-project #include "mlir/IR/BuiltinOps.h" // from @llvm-project #include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include "tensorflow/core/platform/statusor.h" namespace tensorflow { @@ -44,9 +45,9 @@ StatusOr> GenerateKernelForHloCode( mlir::MLIRContext& context, llvm::StringRef tf_code, llvm::ArrayRef architectures, llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool print_ptx, bool print_llvmir, - bool enable_ftz, bool index_64bit, bool jit_compile, - bool jit_i64_indexed_for_large_tensors, bool apply_cl_options); + bool print_ptx, bool print_llvmir, bool enable_ftz, bool index_64bit, + bool jit_compile, bool jit_i64_indexed_for_large_tensors, + bool apply_cl_options); } // namespace kernel_gen } // namespace tensorflow diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir index d6a7c2698f15ea..e6404a84319c0d 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/embed_tf_framework.mlir @@ -80,7 +80,7 @@ func.func @jit_compile_from_str(%ctx : !tf_framework.op_kernel_context) // CHECK: return %[[RES]] %0 = tf_framework.jit_compile_from_str "placeholder" { architectures = ["sm_123", "sm_456"], tileSizes = [1, 2, 3], - unrollFactors = [4], maxSupportedRank = 3 : i64, enableFtz = false, + unrollFactors = [4], enableFtz = false, index64Bit = false, cpuCodegen = false } func.return %0 : !tf_framework.jit_callable } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/func_to_jit_invocations.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/func_to_jit_invocations.mlir index 7da7a482f63774..c1c7a1f2ca9c00 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/func_to_jit_invocations.mlir +++ 
b/tensorflow/compiler/mlir/tools/kernel_gen/tests/func_to_jit_invocations.mlir @@ -1,11 +1,11 @@ // RUN: kernel-gen-opt %s --split-input-file \ // RUN: --func-to-jit-invocation="tile-sizes=1,2,3 unroll-factors=3,2,1 \ -// RUN: max-supported-rank=32 enable-ftz=false cpu-codegen=false" | \ +// RUN: enable-ftz=false cpu-codegen=false" | \ // RUN: FileCheck %s // RUN: kernel-gen-opt %s --split-input-file \ // RUN: --func-to-jit-invocation="tile-sizes=1,2,3 unroll-factors=3,2,1 \ -// RUN: max-supported-rank=32 enable-ftz=false cpu-codegen=false \ +// RUN: enable-ftz=false cpu-codegen=false \ // RUN: jit_i64_indexed_for_large_tensors=true" | \ // RUN: FileCheck %s --check-prefix=CHECK-JFLT @@ -30,7 +30,6 @@ func.func @unary_tanh(%arg : tensor<*xf32>) -> tensor<*xf32> { // CHECK-SAME: { // CHECK-SAME: cpuCodegen = false // CHECK-SAME: enableFtz = false -// CHECK-SAME: maxSupportedRank = 32 : i64 // CHECK-SAME: tileSizes = [1, 2, 3] // CHECK-SAME: unrollFactors = [3, 2, 1] // CHECK-SAME: } @@ -49,7 +48,6 @@ func.func @unary_tanh(%arg : tensor<*xf32>) -> tensor<*xf32> { // CHECK-JFLT-SAME: cpuCodegen = false // CHECK-JFLT-SAME: enableFtz = false // CHECK-JFLT-SAME: index64Bit = true -// CHECK-JFLT-SAME: maxSupportedRank = 32 // CHECK-JFLT-SAME: tileSizes = [1, 2, 3] // CHECK-JFLT-SAME: unrollFactors = [3, 2, 1] // CHECK-JFLT: %[[JIT_0:.*]] = tf_framework.jit_execute %[[JIT]](%[[ARG0]]) @@ -82,7 +80,6 @@ func.func @binary_sub(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>) -> tensor<*x // CHECK-SAME: { // CHECK-SAME: cpuCodegen = false // CHECK-SAME: enableFtz = false -// CHECK-SAME: maxSupportedRank = 32 : i64 // CHECK-SAME: tileSizes = [1, 2, 3] // CHECK-SAME: unrollFactors = [3, 2, 1] // CHECK-SAME: } @@ -114,7 +111,6 @@ func.func @binary_sub(%arg0 : tensor<*xf32>, %arg1 : tensor<*xf32>) -> tensor<*x // CHECK-JFLT-SAME: { // CHECK-JFLT-SAME: cpuCodegen = false // CHECK-JFLT-SAME: enableFtz = false -// CHECK-JFLT-SAME: maxSupportedRank = 32 : i64 // CHECK-JFLT-SAME: 
tileSizes = [1, 2, 3] // CHECK-JFLT-SAME: unrollFactors = [3, 2, 1] // CHECK-JFLT-SAME: } @@ -149,7 +145,6 @@ func.func @reciprocal(%arg0: tensor<*xf32>) // CHECK-SAME: cpuCodegen = false, // CHECK-SAME: enableFtz = false, // CHECK-SAME: index64Bit = false, -// CHECK-SAME: maxSupportedRank = 32 : i64, // CHECK-SAME: tileSizes = [1, 2, 3], // CHECK-SAME: unrollFactors = [3, 2, 1] // CHECK-SAME: } @@ -168,7 +163,6 @@ func.func @reciprocal(%arg0: tensor<*xf32>) // CHECK-JFLT-SAME: cpuCodegen = false // CHECK-JFLT-SAME: enableFtz = false // CHECK-JFLT-SAME: index64Bit = true -// CHECK-JFLT-SAME: maxSupportedRank = 32 // CHECK-JFLT-SAME: tileSizes = [1, 2, 3] // CHECK-JFLT-SAME: unrollFactors = [3, 2, 1] // CHECK-JFLT: %[[JIT_0:.*]] = tf_framework.jit_execute %[[JIT]](%[[ARG0]]) diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir index a1b35ccecc993e..686b34e0d138db 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2.mlir @@ -1,7 +1,131 @@ // RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70 -func.func @AddV2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) - -> tensor<*xf32> attributes {tf_entry, llvm.emit_c_interface} { - %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<*xf32>, tensor<*xf32>) -> tensor<*xf32> - return %0 : tensor<*xf32> +func.func @AddV2(%arg0: tensor<*xf32>, %arg1: tensor<*xf32>) -> tensor<*xf32> attributes {llvm.emit_c_interface, tf_entry} { + %0 = shape.const_shape [1, 1, 1, 1, 1] : tensor<5xindex> + %c5 = arith.constant 5 : index + %1 = shape.const_shape [1, 1, 1, 1] : tensor<4xindex> + %c4 = arith.constant 4 : index + %2 = shape.const_shape [1, 1, 1] : tensor<3xindex> + %c3 = arith.constant 3 : index + %3 = shape.const_shape [1, 1] : tensor<2xindex> + %c2 = arith.constant 2 : index + %4 = 
shape.const_shape [1] : tensor<1xindex> + %c1 = arith.constant 1 : index + %5 = shape.shape_of %arg0 : tensor<*xf32> -> tensor + %6 = shape.shape_of %arg1 : tensor<*xf32> -> tensor + %7 = shape.num_elements %5 : tensor -> index + %8 = arith.cmpi eq, %7, %c1 : index + %9 = scf.if %8 -> (tensor<*xf32>) { + %14 = shape.num_elements %6 : tensor -> index + %from_elements = tensor.from_elements %14 : tensor<1xindex> + %15 = mhlo.reshape %arg0 : (tensor<*xf32>) -> tensor + %16 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xf32>, tensor<1xindex>) -> tensor + %17 = chlo.broadcast_add %15, %16 : (tensor, tensor) -> tensor + %cast = tensor.cast %17 : tensor to tensor<*xf32> + scf.yield %cast : tensor<*xf32> + } else { + %14 = shape.num_elements %6 : tensor -> index + %15 = arith.cmpi eq, %14, %c1 : index + %16 = scf.if %15 -> (tensor<*xf32>) { + %17 = shape.num_elements %5 : tensor -> index + %from_elements = tensor.from_elements %17 : tensor<1xindex> + %18 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xf32>, tensor<1xindex>) -> tensor + %19 = mhlo.reshape %arg1 : (tensor<*xf32>) -> tensor + %20 = chlo.broadcast_add %18, %19 : (tensor, tensor) -> tensor + %cast = tensor.cast %20 : tensor to tensor<*xf32> + scf.yield %cast : tensor<*xf32> + } else { + %17 = shape.shape_eq %5, %6 : tensor, tensor + %18 = scf.if %17 -> (tensor<*xf32>) { + %19 = shape.any %5, %6 : tensor, tensor -> tensor + %20 = shape.num_elements %19 : tensor -> index + %from_elements = tensor.from_elements %20 : tensor<1xindex> + %21 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xf32>, tensor<1xindex>) -> tensor + %22 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xf32>, tensor<1xindex>) -> tensor + %23 = chlo.broadcast_add %21, %22 : (tensor, tensor) -> tensor + %cast = tensor.cast %23 : tensor to tensor<*xf32> + scf.yield %cast : tensor<*xf32> + } else { + %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %20 = shape.rank %19#0 : tensor 
-> index + %21 = shape.rank %19#1 : tensor -> index + %22 = arith.cmpi sgt, %20, %21 : index + %23 = arith.select %22, %20, %21 : index + %24 = arith.cmpi ule, %23, %c1 : index + %25 = scf.if %24 -> (tensor<*xf32>) { + %26 = shape.broadcast %19#0, %4 : tensor, tensor<1xindex> -> tensor + %cast = tensor.cast %26 : tensor to tensor<1xindex> + %27 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xf32>, tensor<1xindex>) -> tensor + %28 = shape.broadcast %19#1, %4 : tensor, tensor<1xindex> -> tensor + %cast_0 = tensor.cast %28 : tensor to tensor<1xindex> + %29 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xf32>, tensor<1xindex>) -> tensor + %30 = chlo.broadcast_add %27, %29 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %30 : tensor to tensor<*xf32> + scf.yield %cast_1 : tensor<*xf32> + } else { + %26 = arith.cmpi ule, %23, %c2 : index + %27 = scf.if %26 -> (tensor<*xf32>) { + %28 = shape.broadcast %19#0, %3 : tensor, tensor<2xindex> -> tensor + %cast = tensor.cast %28 : tensor to tensor<2xindex> + %29 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xf32>, tensor<2xindex>) -> tensor + %30 = shape.broadcast %19#1, %3 : tensor, tensor<2xindex> -> tensor + %cast_0 = tensor.cast %30 : tensor to tensor<2xindex> + %31 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xf32>, tensor<2xindex>) -> tensor + %32 = chlo.broadcast_add %29, %31 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %32 : tensor to tensor<*xf32> + scf.yield %cast_1 : tensor<*xf32> + } else { + %28 = arith.cmpi ule, %23, %c3 : index + %29 = scf.if %28 -> (tensor<*xf32>) { + %30 = shape.broadcast %19#0, %2 : tensor, tensor<3xindex> -> tensor + %cast = tensor.cast %30 : tensor to tensor<3xindex> + %31 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xf32>, tensor<3xindex>) -> tensor + %32 = shape.broadcast %19#1, %2 : tensor, tensor<3xindex> -> tensor + %cast_0 = tensor.cast %32 : tensor to tensor<3xindex> + %33 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xf32>, tensor<3xindex>) -> tensor + %34 = 
chlo.broadcast_add %31, %33 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %34 : tensor to tensor<*xf32> + scf.yield %cast_1 : tensor<*xf32> + } else { + %30 = arith.cmpi ule, %23, %c4 : index + %31 = scf.if %30 -> (tensor<*xf32>) { + %32 = shape.broadcast %19#0, %1 : tensor, tensor<4xindex> -> tensor + %cast = tensor.cast %32 : tensor to tensor<4xindex> + %33 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xf32>, tensor<4xindex>) -> tensor + %34 = shape.broadcast %19#1, %1 : tensor, tensor<4xindex> -> tensor + %cast_0 = tensor.cast %34 : tensor to tensor<4xindex> + %35 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xf32>, tensor<4xindex>) -> tensor + %36 = chlo.broadcast_add %33, %35 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %36 : tensor to tensor<*xf32> + scf.yield %cast_1 : tensor<*xf32> + } else { + %32 = arith.cmpi ule, %23, %c5 : index + cf.assert %32, "Input for dynamic binary or n-ary op lowering was of a rank greater than 5" + %33 = shape.broadcast %19#0, %0 : tensor, tensor<5xindex> -> tensor + %cast = tensor.cast %33 : tensor to tensor<5xindex> + %34 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xf32>, tensor<5xindex>) -> tensor + %35 = shape.broadcast %19#1, %0 : tensor, tensor<5xindex> -> tensor + %cast_0 = tensor.cast %35 : tensor to tensor<5xindex> + %36 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xf32>, tensor<5xindex>) -> tensor + %37 = chlo.broadcast_add %34, %36 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %37 : tensor to tensor<*xf32> + scf.yield %cast_1 : tensor<*xf32> + } + scf.yield %31 : tensor<*xf32> + } + scf.yield %29 : tensor<*xf32> + } + scf.yield %27 : tensor<*xf32> + } + scf.yield %25 : tensor<*xf32> + } + scf.yield %18 : tensor<*xf32> + } + scf.yield %16 : tensor<*xf32> + } + %10 = shape.shape_of %arg0 : tensor<*xf32> -> tensor + %11 = shape.shape_of %arg1 : tensor<*xf32> -> tensor + %12 = shape.broadcast %10, %11 : tensor, tensor -> tensor + %13 = mhlo.dynamic_reshape %9, %12 : (tensor<*xf32>, tensor) 
-> tensor<*xf32> + return %13 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir index ee01fe543eaac4..f38a2dca1bc8cd 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/add_v2_unsigned.mlir @@ -1,7 +1,131 @@ // RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70 -func.func @AddV2(%arg0: tensor<*xui32>, %arg1: tensor<*xui32>) - -> tensor<*xui32> attributes {tf_entry, llvm.emit_c_interface} { - %0 = chlo.broadcast_add %arg0, %arg1 : (tensor<*xui32>, tensor<*xui32>) -> tensor<*xui32> - return %0 : tensor<*xui32> +func.func @AddV2(%arg0: tensor<*xui32>, %arg1: tensor<*xui32>) -> tensor<*xui32> attributes {llvm.emit_c_interface, tf_entry} { + %0 = shape.const_shape [1, 1, 1, 1, 1] : tensor<5xindex> + %c5 = arith.constant 5 : index + %1 = shape.const_shape [1, 1, 1, 1] : tensor<4xindex> + %c4 = arith.constant 4 : index + %2 = shape.const_shape [1, 1, 1] : tensor<3xindex> + %c3 = arith.constant 3 : index + %3 = shape.const_shape [1, 1] : tensor<2xindex> + %c2 = arith.constant 2 : index + %4 = shape.const_shape [1] : tensor<1xindex> + %c1 = arith.constant 1 : index + %5 = shape.shape_of %arg0 : tensor<*xui32> -> tensor + %6 = shape.shape_of %arg1 : tensor<*xui32> -> tensor + %7 = shape.num_elements %5 : tensor -> index + %8 = arith.cmpi eq, %7, %c1 : index + %9 = scf.if %8 -> (tensor<*xui32>) { + %14 = shape.num_elements %6 : tensor -> index + %from_elements = tensor.from_elements %14 : tensor<1xindex> + %15 = mhlo.reshape %arg0 : (tensor<*xui32>) -> tensor + %16 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %17 = chlo.broadcast_add %15, %16 : (tensor, tensor) -> tensor + %cast = tensor.cast %17 : tensor to tensor<*xui32> + 
scf.yield %cast : tensor<*xui32> + } else { + %14 = shape.num_elements %6 : tensor -> index + %15 = arith.cmpi eq, %14, %c1 : index + %16 = scf.if %15 -> (tensor<*xui32>) { + %17 = shape.num_elements %5 : tensor -> index + %from_elements = tensor.from_elements %17 : tensor<1xindex> + %18 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %19 = mhlo.reshape %arg1 : (tensor<*xui32>) -> tensor + %20 = chlo.broadcast_add %18, %19 : (tensor, tensor) -> tensor + %cast = tensor.cast %20 : tensor to tensor<*xui32> + scf.yield %cast : tensor<*xui32> + } else { + %17 = shape.shape_eq %5, %6 : tensor, tensor + %18 = scf.if %17 -> (tensor<*xui32>) { + %19 = shape.any %5, %6 : tensor, tensor -> tensor + %20 = shape.num_elements %19 : tensor -> index + %from_elements = tensor.from_elements %20 : tensor<1xindex> + %21 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %22 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %23 = chlo.broadcast_add %21, %22 : (tensor, tensor) -> tensor + %cast = tensor.cast %23 : tensor to tensor<*xui32> + scf.yield %cast : tensor<*xui32> + } else { + %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %20 = shape.rank %19#0 : tensor -> index + %21 = shape.rank %19#1 : tensor -> index + %22 = arith.cmpi sgt, %20, %21 : index + %23 = arith.select %22, %20, %21 : index + %24 = arith.cmpi ule, %23, %c1 : index + %25 = scf.if %24 -> (tensor<*xui32>) { + %26 = shape.broadcast %19#0, %4 : tensor, tensor<1xindex> -> tensor + %cast = tensor.cast %26 : tensor to tensor<1xindex> + %27 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<1xindex>) -> tensor + %28 = shape.broadcast %19#1, %4 : tensor, tensor<1xindex> -> tensor + %cast_0 = tensor.cast %28 : tensor to tensor<1xindex> + %29 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<1xindex>) -> tensor + %30 = chlo.broadcast_add 
%27, %29 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %30 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %26 = arith.cmpi ule, %23, %c2 : index + %27 = scf.if %26 -> (tensor<*xui32>) { + %28 = shape.broadcast %19#0, %3 : tensor, tensor<2xindex> -> tensor + %cast = tensor.cast %28 : tensor to tensor<2xindex> + %29 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<2xindex>) -> tensor + %30 = shape.broadcast %19#1, %3 : tensor, tensor<2xindex> -> tensor + %cast_0 = tensor.cast %30 : tensor to tensor<2xindex> + %31 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<2xindex>) -> tensor + %32 = chlo.broadcast_add %29, %31 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %32 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %28 = arith.cmpi ule, %23, %c3 : index + %29 = scf.if %28 -> (tensor<*xui32>) { + %30 = shape.broadcast %19#0, %2 : tensor, tensor<3xindex> -> tensor + %cast = tensor.cast %30 : tensor to tensor<3xindex> + %31 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<3xindex>) -> tensor + %32 = shape.broadcast %19#1, %2 : tensor, tensor<3xindex> -> tensor + %cast_0 = tensor.cast %32 : tensor to tensor<3xindex> + %33 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<3xindex>) -> tensor + %34 = chlo.broadcast_add %31, %33 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %34 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %30 = arith.cmpi ule, %23, %c4 : index + %31 = scf.if %30 -> (tensor<*xui32>) { + %32 = shape.broadcast %19#0, %1 : tensor, tensor<4xindex> -> tensor + %cast = tensor.cast %32 : tensor to tensor<4xindex> + %33 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<4xindex>) -> tensor + %34 = shape.broadcast %19#1, %1 : tensor, tensor<4xindex> -> tensor + %cast_0 = tensor.cast %34 : tensor to tensor<4xindex> + %35 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<4xindex>) -> 
tensor + %36 = chlo.broadcast_add %33, %35 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %36 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %32 = arith.cmpi ule, %23, %c5 : index + cf.assert %32, "Input for dynamic binary or n-ary op lowering was of a rank greater than 5" + %33 = shape.broadcast %19#0, %0 : tensor, tensor<5xindex> -> tensor + %cast = tensor.cast %33 : tensor to tensor<5xindex> + %34 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<5xindex>) -> tensor + %35 = shape.broadcast %19#1, %0 : tensor, tensor<5xindex> -> tensor + %cast_0 = tensor.cast %35 : tensor to tensor<5xindex> + %36 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<5xindex>) -> tensor + %37 = chlo.broadcast_add %34, %36 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %37 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } + scf.yield %31 : tensor<*xui32> + } + scf.yield %29 : tensor<*xui32> + } + scf.yield %27 : tensor<*xui32> + } + scf.yield %25 : tensor<*xui32> + } + scf.yield %18 : tensor<*xui32> + } + scf.yield %16 : tensor<*xui32> + } + %10 = shape.shape_of %arg0 : tensor<*xui32> -> tensor + %11 = shape.shape_of %arg1 : tensor<*xui32> -> tensor + %12 = shape.broadcast %10, %11 : tensor, tensor -> tensor + %13 = mhlo.dynamic_reshape %9, %12 : (tensor<*xui32>, tensor) -> tensor<*xui32> + return %13 : tensor<*xui32> } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir new file mode 100644 index 00000000000000..1facc06ee500e9 --- /dev/null +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/minimum.mlir @@ -0,0 +1,131 @@ +// RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70 + +func.func @Minimum_GPU_DT_UINT32_DT_UINT32(%arg0: tensor<*xui32>, %arg1: tensor<*xui32>) -> tensor<*xui32> attributes {llvm.emit_c_interface, tf_entry} { + %0 
= shape.const_shape [1, 1, 1, 1, 1] : tensor<5xindex> + %c5 = arith.constant 5 : index + %1 = shape.const_shape [1, 1, 1, 1] : tensor<4xindex> + %c4 = arith.constant 4 : index + %2 = shape.const_shape [1, 1, 1] : tensor<3xindex> + %c3 = arith.constant 3 : index + %3 = shape.const_shape [1, 1] : tensor<2xindex> + %c2 = arith.constant 2 : index + %4 = shape.const_shape [1] : tensor<1xindex> + %c1 = arith.constant 1 : index + %5 = shape.shape_of %arg0 : tensor<*xui32> -> tensor + %6 = shape.shape_of %arg1 : tensor<*xui32> -> tensor + %7 = shape.num_elements %5 : tensor -> index + %8 = arith.cmpi eq, %7, %c1 : index + %9 = scf.if %8 -> (tensor<*xui32>) { + %14 = shape.num_elements %6 : tensor -> index + %from_elements = tensor.from_elements %14 : tensor<1xindex> + %15 = mhlo.reshape %arg0 : (tensor<*xui32>) -> tensor + %16 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %17 = chlo.broadcast_minimum %15, %16 : (tensor, tensor) -> tensor + %cast = tensor.cast %17 : tensor to tensor<*xui32> + scf.yield %cast : tensor<*xui32> + } else { + %14 = shape.num_elements %6 : tensor -> index + %15 = arith.cmpi eq, %14, %c1 : index + %16 = scf.if %15 -> (tensor<*xui32>) { + %17 = shape.num_elements %5 : tensor -> index + %from_elements = tensor.from_elements %17 : tensor<1xindex> + %18 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %19 = mhlo.reshape %arg1 : (tensor<*xui32>) -> tensor + %20 = chlo.broadcast_minimum %18, %19 : (tensor, tensor) -> tensor + %cast = tensor.cast %20 : tensor to tensor<*xui32> + scf.yield %cast : tensor<*xui32> + } else { + %17 = shape.shape_eq %5, %6 : tensor, tensor + %18 = scf.if %17 -> (tensor<*xui32>) { + %19 = shape.any %5, %6 : tensor, tensor -> tensor + %20 = shape.num_elements %19 : tensor -> index + %from_elements = tensor.from_elements %20 : tensor<1xindex> + %21 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + 
%22 = mhlo.dynamic_reshape %arg1, %from_elements : (tensor<*xui32>, tensor<1xindex>) -> tensor + %23 = chlo.broadcast_minimum %21, %22 : (tensor, tensor) -> tensor + %cast = tensor.cast %23 : tensor to tensor<*xui32> + scf.yield %cast : tensor<*xui32> + } else { + %19:2 = chlo.minimum_broadcast_shapes %5, %6 : tensor, tensor -> tensor, tensor + %20 = shape.rank %19#0 : tensor -> index + %21 = shape.rank %19#1 : tensor -> index + %22 = arith.cmpi sgt, %20, %21 : index + %23 = arith.select %22, %20, %21 : index + %24 = arith.cmpi ule, %23, %c1 : index + %25 = scf.if %24 -> (tensor<*xui32>) { + %26 = shape.broadcast %19#0, %4 : tensor, tensor<1xindex> -> tensor + %cast = tensor.cast %26 : tensor to tensor<1xindex> + %27 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<1xindex>) -> tensor + %28 = shape.broadcast %19#1, %4 : tensor, tensor<1xindex> -> tensor + %cast_0 = tensor.cast %28 : tensor to tensor<1xindex> + %29 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<1xindex>) -> tensor + %30 = chlo.broadcast_minimum %27, %29 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %30 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %26 = arith.cmpi ule, %23, %c2 : index + %27 = scf.if %26 -> (tensor<*xui32>) { + %28 = shape.broadcast %19#0, %3 : tensor, tensor<2xindex> -> tensor + %cast = tensor.cast %28 : tensor to tensor<2xindex> + %29 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<2xindex>) -> tensor + %30 = shape.broadcast %19#1, %3 : tensor, tensor<2xindex> -> tensor + %cast_0 = tensor.cast %30 : tensor to tensor<2xindex> + %31 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<2xindex>) -> tensor + %32 = chlo.broadcast_minimum %29, %31 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %32 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %28 = arith.cmpi ule, %23, %c3 : index + %29 = scf.if %28 -> (tensor<*xui32>) { + %30 = shape.broadcast %19#0, %2 : 
tensor, tensor<3xindex> -> tensor + %cast = tensor.cast %30 : tensor to tensor<3xindex> + %31 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<3xindex>) -> tensor + %32 = shape.broadcast %19#1, %2 : tensor, tensor<3xindex> -> tensor + %cast_0 = tensor.cast %32 : tensor to tensor<3xindex> + %33 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<3xindex>) -> tensor + %34 = chlo.broadcast_minimum %31, %33 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %34 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %30 = arith.cmpi ule, %23, %c4 : index + %31 = scf.if %30 -> (tensor<*xui32>) { + %32 = shape.broadcast %19#0, %1 : tensor, tensor<4xindex> -> tensor + %cast = tensor.cast %32 : tensor to tensor<4xindex> + %33 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<4xindex>) -> tensor + %34 = shape.broadcast %19#1, %1 : tensor, tensor<4xindex> -> tensor + %cast_0 = tensor.cast %34 : tensor to tensor<4xindex> + %35 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<4xindex>) -> tensor + %36 = chlo.broadcast_minimum %33, %35 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %36 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } else { + %32 = arith.cmpi ule, %23, %c5 : index + cf.assert %32, "Input for dynamic binary or n-ary op lowering was of a rank greater than 5" + %33 = shape.broadcast %19#0, %0 : tensor, tensor<5xindex> -> tensor + %cast = tensor.cast %33 : tensor to tensor<5xindex> + %34 = mhlo.dynamic_reshape %arg0, %cast : (tensor<*xui32>, tensor<5xindex>) -> tensor + %35 = shape.broadcast %19#1, %0 : tensor, tensor<5xindex> -> tensor + %cast_0 = tensor.cast %35 : tensor to tensor<5xindex> + %36 = mhlo.dynamic_reshape %arg1, %cast_0 : (tensor<*xui32>, tensor<5xindex>) -> tensor + %37 = chlo.broadcast_minimum %34, %36 : (tensor, tensor) -> tensor + %cast_1 = tensor.cast %37 : tensor to tensor<*xui32> + scf.yield %cast_1 : tensor<*xui32> + } + scf.yield %31 : 
tensor<*xui32> + } + scf.yield %29 : tensor<*xui32> + } + scf.yield %27 : tensor<*xui32> + } + scf.yield %25 : tensor<*xui32> + } + scf.yield %18 : tensor<*xui32> + } + scf.yield %16 : tensor<*xui32> + } + %10 = shape.shape_of %arg0 : tensor<*xui32> -> tensor + %11 = shape.shape_of %arg1 : tensor<*xui32> -> tensor + %12 = shape.broadcast %10, %11 : tensor, tensor -> tensor + %13 = mhlo.dynamic_reshape %9, %12 : (tensor<*xui32>, tensor) -> tensor<*xui32> + return %13 : tensor<*xui32> +} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir index 8685ab17faee7a..2d3c8e6f5b9ef7 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/hlo_to_kernel/tanh.mlir @@ -1,6 +1,11 @@ // RUN: hlo_to_kernel --input=%s --output=%t --unroll_factors=4 --tile_sizes=256 --arch=sm_70 -func.func @tanh(%arg: tensor<*xf32>) -> tensor<*xf32> attributes {tf_entry} { - %0 = mhlo.tanh %arg : tensor<*xf32> - return %0 : tensor<*xf32> +func.func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> attributes {tf_entry} { + %0 = shape.shape_of %arg0 : tensor<*xf32> -> tensor + %1 = shape.num_elements %0 : tensor -> index + %from_elements = tensor.from_elements %1 : tensor<1xindex> + %2 = mhlo.dynamic_reshape %arg0, %from_elements : (tensor<*xf32>, tensor<1xindex>) -> tensor + %3 = mhlo.tanh %2 : tensor + %4 = mhlo.dynamic_reshape %3, %0 : (tensor, tensor) -> tensor<*xf32> + return %4 : tensor<*xf32> } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/isinf.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/isinf.mlir index 22c9572dfd810c..da3ca471a857e4 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/isinf.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/isinf.mlir @@ -1,6 +1,6 @@ // RUN: tf-opt %s --test-tf-lower-tf --xla-legalize-tf | \ -// RUN: mlir-hlo-opt 
--mhlo-rank-specialization-cluster \ -// RUN: --mhlo-rank-specialization-to-scf --hlo-legalize-to-linalg \ +// RUN: mlir-hlo-opt \ +// RUN: --hlo-legalize-to-linalg \ // RUN: --empty-tensor-to-alloc-tensor \ // RUN: --computeop-and-func-bufferize --canonicalize | \ // RUN: kernel-gen-opt -allow-unregistered-dialect \ diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir index ccd7a901fd0513..b5055e9ba4c8ab 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/ops.mlir @@ -82,7 +82,7 @@ func.func @jit_compile(%ctx : !tf_framework.op_kernel_context) func.func @jit_compile_from_str_wo_ctx() -> !tf_framework.jit_callable { %callable = tf_framework.jit_compile_from_str "placeholder" { architectures = ["sm_123", "sm_456"], tileSizes = [1, 2, 3], - unrollFactors = [4], maxSupportedRank = 3 : i64, enableFtz = false, + unrollFactors = [4], enableFtz = false, index64Bit = false, cpuCodegen = false } func.return %callable : !tf_framework.jit_callable } @@ -92,7 +92,7 @@ func.func @jit_compile_from_str(%ctx : !tf_framework.op_kernel_context) -> !tf_framework.jit_callable { %callable = tf_framework.jit_compile_from_str %ctx , "placeholder" { architectures = ["sm_123", "sm_456"], tileSizes = [1, 2, 3], - unrollFactors = [4], maxSupportedRank = 3 : i64, enableFtz = false, + unrollFactors = [4], enableFtz = false, index64Bit = false, cpuCodegen = false } func.return %callable : !tf_framework.jit_callable } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir deleted file mode 100644 index b8a227688c9dcb..00000000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tanh.mlir +++ /dev/null @@ -1,26 +0,0 @@ -// RUN: tf-opt %s --test-tf-lower-tf --xla-legalize-tf | \ -// RUN: mlir-hlo-opt --mhlo-rank-specialization-cluster \ -// RUN: 
--mhlo-rank-specialization-to-scf --hlo-legalize-to-linalg \ -// RUN: --empty-tensor-to-alloc-tensor \ -// RUN: --computeop-and-func-bufferize --canonicalize | \ -// RUN: kernel-gen-opt -allow-unregistered-dialect \ -// RUN: --shape-to-descriptors \ -// RUN: --canonicalize --kernelgen-final-bufferize | \ -// RUN: FileCheck %s - -// Test whether all shape computations required for tanh can be lowered to -// the standard dialect, scf and descriptors. We check for a sparse pattern here, -// as each lowering pattern is already tested and we just care for the -// integration. -// TODO: Expand this pattern once things have stabilized. -// CHECK-LABEL: @tanh -func.func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> { - // CHECK: alloc - // CHECK: scf.for - // CHECK: memref.reshape - // CHECK: alloc - // CHECK: linalg.generic - // CHECK: memref.reshape - %0 = "tf.Tanh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-lmhlo.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-lmhlo.mlir deleted file mode 100644 index c30b21d8cd3162..00000000000000 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf-legalize-to-lmhlo.mlir +++ /dev/null @@ -1,38 +0,0 @@ -// RUN: tf-opt %s --xla-legalize-tf='legalize-chlo=false' | \ -// RUN: mlir-hlo-opt --mhlo-rank-specialization-cluster \ -// RUN: --mhlo-rank-specialization-to-scf --chlo-legalize-to-hlo \ -// RUN: --hlo-legalize-to-linalg --empty-tensor-to-alloc-tensor \ -// RUN: --computeop-and-func-bufferize | \ -// RUN: kernel-gen-opt --shape-to-descriptors \ -// RUN: --canonicalize --kernelgen-final-bufferize - -func.func @acos(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Acos"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} - -func.func @tan(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Tan"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : 
tensor<*xf32> -} - -func.func @tanh(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Tanh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} - -func.func @sin(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Sin"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} - -func.func @sinh(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Sinh"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} - -func.func @erf(%arg0: tensor<*xf32>) -> tensor<*xf32> { - %0 = "tf.Erf"(%arg0) { } : (tensor<*xf32>) -> tensor<*xf32> - func.return %0 : tensor<*xf32> -} - diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir index ed65470eb72728..4da5051a7bd2ac 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tests/tf_framework_legalize_to_llvm.mlir @@ -151,7 +151,7 @@ func.func @is_valid_memref(%buf: memref) -> i1 { // ----- -// CHECK-LABEL: llvm.func @_mlir_ciface_tf_jit_compile(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, i64, i1, i1, i1) -> !llvm.ptr +// CHECK-LABEL: llvm.func @_mlir_ciface_tf_jit_compile(!llvm.ptr, !llvm.ptr, i64, !llvm.ptr, i64, !llvm.ptr, i1, i1, i1) -> !llvm.ptr // CHECK: llvm.mlir.global internal constant @[[CODE:jit_module_code_[0-9]+]]("placeholder\00") // CHECK: @jit_compile_from_str(%[[CTX:.*]]: !llvm.ptr) @@ -184,17 +184,16 @@ func.func @jit_compile_from_str(%ctx: !tf_framework.op_kernel_context) // CHECK: %[[C4:.*]] = llvm.mlir.constant(4 : i64) // CHECK: llvm.store %[[C4]], %[[PTR]] - // CHECK-DAG: %[[MAX_RANK:.*]] = llvm.mlir.constant(3 : i64) // CHECK-DAG: %[[ENABLE_FTZ:.*]] = llvm.mlir.constant(false) // CHECK-DAG: %[[CPU_CODEGEN:.*]] = llvm.mlir.constant(false) // CHECK: %[[RES:.*]] = llvm.call @_mlir_ciface_tf_jit_compile // 
CHECK-SAME: %[[CTX]], %[[CODE_PTR]], // CHECK-SAME: %[[NUM_TILE_SIZES]], %[[TILE_SIZES]], // CHECK-SAME: %[[NUM_UNROLL_FACTORS]], %[[UNROLL_FACTORS]], - // CHECK-SAME: %[[MAX_RANK]], %[[ENABLE_FTZ]], %[[CPU_CODEGEN]] + // CHECK-SAME: %[[ENABLE_FTZ]], %[[CPU_CODEGEN]] // CHECK: llvm.return %[[RES]] %0 = tf_framework.jit_compile_from_str %ctx, "placeholder" { - tileSizes = [1, 2, 3], unrollFactors = [4], maxSupportedRank = 3 : i64, + tileSizes = [1, 2, 3], unrollFactors = [4], enableFtz = false, index64Bit = false, cpuCodegen = false } func.return %0 : !tf_framework.jit_callable } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc index 34cbb4069ffeac..b2f717046ce1d1 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.cc @@ -19,22 +19,44 @@ limitations under the License. #include #include +#include "absl/log/check.h" +#include "absl/log/log.h" +#include "absl/strings/str_cat.h" +#include "absl/strings/string_view.h" +#include "llvm/ADT/Hashing.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/Mangling.h" +#include "llvm/ExecutionEngine/Orc/Shared/ExecutorAddress.h" +#include "llvm/Support/Error.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" #include "mlir/Dialect/MemRef/Transforms/AllocationOpInterfaceImpl.h" // from @llvm-project +#include "mlir/ExecutionEngine/CRunnerUtils.h" // from @llvm-project #include "mlir/ExecutionEngine/ExecutionEngine.h" // from @llvm-project #include "mlir/ExecutionEngine/OptUtils.h" // from @llvm-project +#include "mlir/IR/BuiltinOps.h" // from @llvm-project +#include "mlir/IR/MLIRContext.h" // from @llvm-project +#include "mlir/IR/OwningOpRef.h" // from @llvm-project #include 
"mlir/Parser/Parser.h" // from @llvm-project +#include "mlir/Support/LLVM.h" // from @llvm-project #include "tensorflow/compiler/mlir/tools/kernel_gen/compile_cache_item.pb.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/ir/tf_framework_ops.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/kernel_creator.h" #include "tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h" +#include "xla/stream_executor/device_description.h" #include "xla/stream_executor/stream.h" #include "tensorflow/core/framework/allocator.h" +#include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/resource_mgr.h" -#include "tensorflow/core/lib/io/path.h" +#include "tensorflow/core/framework/tensor_shape.h" +#include "tensorflow/core/platform/env.h" +#include "tensorflow/core/platform/path.h" +#include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" #include "tensorflow/core/platform/statusor.h" +#include "tsl/framework/allocator.h" #if defined(GOOGLE_CUDA) || defined(TENSORFLOW_USE_ROCM) #include @@ -162,8 +184,8 @@ void InitializeLlvmCompiler() { llvm::Expected> Compile( const std::string code, llvm::SmallVectorImpl& architectures, llvm::SmallVectorImpl& tile_sizes, - llvm::SmallVectorImpl& unroll_factors, int64_t max_supported_rank, - bool enable_ftz, bool index_64bit) { + llvm::SmallVectorImpl& unroll_factors, bool enable_ftz, + bool index_64bit) { std::string cache_dir; if (const char* dir = getenv(kTFJitCacheDirEnvVar.data())) { cache_dir = dir; @@ -197,7 +219,6 @@ llvm::Expected> Compile( tensorflow::StatusOr> status_or_module = tensorflow::kernel_gen::GenerateKernelForHloCode( context, code, architectures, tile_sizes, unroll_factors, - max_supported_rank, /*print_ptx=*/false, /*print_llvmir=*/false, enable_ftz, index_64bit, /*jit_compile=*/false, @@ -261,8 +282,7 @@ llvm::SmallVector SmallVectorFromCArray(int64_t num_elements, extern "C" void* _mlir_ciface_tf_jit_compile( void* op_kernel_ctx, char* code, int64_t 
num_tile_sizes, int64_t* tile_sizes_ptr, int64_t num_unroll_factors, - int64_t* unroll_factors_ptr, int64_t max_supported_rank, bool enable_ftz, - bool index_64bit) { + int64_t* unroll_factors_ptr, bool enable_ftz, bool index_64bit) { // Get the resource manager. auto* ctx = static_cast(op_kernel_ctx); tensorflow::ResourceMgr* rm = ctx->resource_manager(); @@ -303,8 +323,8 @@ extern "C" void* _mlir_ciface_tf_jit_compile( // Lookup or compile the execution module. ExecutionEngine* engine = jit_cache->LookupOrCompile(code, [&]() { - return Compile(code, architectures, tile_sizes, unroll_factors, - max_supported_rank, enable_ftz, index_64bit); + return Compile(code, architectures, tile_sizes, unroll_factors, enable_ftz, + index_64bit); }); if (engine == nullptr) { ReportError(op_kernel_ctx, ErrorCode::UNKNOWN, "JIT compilation failed."); diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h index 3a7c879d2c64c9..a62dc2c7020ab7 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_framework_c_interface.h @@ -36,8 +36,7 @@ extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_report_error( extern "C" MLIR_RUNNERUTILS_EXPORT void* _mlir_ciface_tf_jit_compile( void* op_kernel_ctx, char* code, int64_t num_tile_sizes, int64_t* tile_sizes_ptr, int64_t num_unroll_factors, - int64_t* unroll_factors_ptr, int64_t max_supported_rank, bool enable_ftz, - bool index_64bit); + int64_t* unroll_factors_ptr, bool enable_ftz, bool index_64bit); extern "C" MLIR_RUNNERUTILS_EXPORT void _mlir_ciface_tf_jit_execute( void* op_kernel_ctx, void* callable, void* result, int64_t num_args, diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc index 6d2e954595645f..0d3b4f2d4a0eaf 100644 --- 
a/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.cc @@ -22,7 +22,6 @@ limitations under the License. #include "absl/container/flat_hash_map.h" #include "absl/status/status.h" #include "absl/strings/str_cat.h" -#include "third_party/gpus/cuda/include/cuda.h" #include "xla/stream_executor/stream.h" #include "tensorflow/core/framework/op_kernel.h" #include "tensorflow/core/framework/op_requires.h" @@ -31,6 +30,10 @@ limitations under the License. #include "tensorflow/core/platform/refcount.h" #include "tensorflow/core/platform/status.h" +#if GOOGLE_CUDA +#include "third_party/gpus/cuda/include/cuda.h" +#endif + static void ReportInternalError(tensorflow::OpKernelContext *ctx, const std::string msg) { if (ctx == nullptr) { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h b/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h index 79e2c0cc530552..be1325c5a4dc16 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_gpu_runtime_wrappers.h @@ -18,9 +18,12 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "mlir/ExecutionEngine/RunnerUtils.h" // from @llvm-project +#include "tensorflow/core/framework/resource_base.h" #include "tensorflow/core/framework/resource_op_kernel.h" #include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" #include "tsl/platform/hash.h" +#include "tsl/platform/thread_annotations.h" #if GOOGLE_CUDA #include "third_party/gpus/cuda/include/cuda.h" diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.cc b/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.cc index 4d69d754a0a8b8..b2e3cc19b581c5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.cc @@ -19,6 +19,11 @@ limitations under the License. #include #include +#include "llvm/Support/Error.h" +#include "mlir/ExecutionEngine/ExecutionEngine.h" // from @llvm-project +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" + namespace mlir { namespace kernel_gen { namespace tf_framework { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h b/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h index 414e7954be271e..f03e5778f6c8c5 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/tf_jit_cache.h @@ -21,7 +21,11 @@ limitations under the License. 
#include "absl/container/flat_hash_map.h" #include "mlir/ExecutionEngine/ExecutionEngine.h" // from @llvm-project +#include "tensorflow/core/framework/resource_base.h" #include "tensorflow/core/framework/resource_op_kernel.h" +#include "tensorflow/core/platform/mutex.h" +#include "tensorflow/core/platform/status.h" +#include "tsl/platform/thread_annotations.h" namespace mlir { namespace kernel_gen { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc index af0943ded1f8d8..32faed506e52b4 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/copy_cleanup_pass.cc @@ -60,7 +60,7 @@ void RemoveCopyIfTargetOnlyRead(func::FuncOp func) { } continue; } - if (auto effect_interface = cast(user)) { + if (auto effect_interface = dyn_cast(user)) { if (reader) { at_most_one_read = false; } else { diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc index d87271e161529b..60f8876f109553 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/func_to_jit_invocations.cc @@ -176,8 +176,7 @@ LogicalResult RewriteToLargeSizeJit(FuncOp op) { void PackJITCompileOp(tf_framework::JITCompileOp op, llvm::ArrayRef tile_sizes, - llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool enable_ftz, + llvm::ArrayRef unroll_factors, bool enable_ftz, bool index_64bit, bool cpu_codegen) { IRRewriter rewriter(op.getContext()); Block *body = op.SingleBlock::getBody(); @@ -219,7 +218,6 @@ void PackJITCompileOp(tf_framework::JITCompileOp op, op, op->getResultTypes(), op.getCtx(), rewriter.getStringAttr(code), rewriter.getI64ArrayAttr(tile_sizes), 
rewriter.getI64ArrayAttr(unroll_factors), - rewriter.getI64IntegerAttr(max_supported_rank), rewriter.getBoolAttr(enable_ftz), rewriter.getBoolAttr(index_64bit), rewriter.getBoolAttr(cpu_codegen)); } @@ -231,12 +229,11 @@ struct FuncToJITInvocationPass : public impl::FuncToJITInvocationPassBase { explicit FuncToJITInvocationPass(llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool enable_ftz, - bool index_64bit, bool cpu_codegen, + bool enable_ftz, bool index_64bit, + bool cpu_codegen, bool jit_i64_indexed_for_large_tensors) { tile_sizes_ = tile_sizes; unroll_factors_ = unroll_factors; - max_supported_rank_ = max_supported_rank; enable_ftz_ = enable_ftz; index_64bit_ = index_64bit; cpu_codegen_ = cpu_codegen; @@ -255,9 +252,9 @@ struct FuncToJITInvocationPass } getOperation().walk([&](tf_framework::JITCompileOp op) { - PackJITCompileOp( - op, tile_sizes_, unroll_factors_, max_supported_rank_, enable_ftz_, - index_64bit_ || jit_i64_indexed_for_large_tensors_, cpu_codegen_); + PackJITCompileOp(op, tile_sizes_, unroll_factors_, enable_ftz_, + index_64bit_ || jit_i64_indexed_for_large_tensors_, + cpu_codegen_); }); } }; @@ -266,11 +263,11 @@ struct FuncToJITInvocationPass std::unique_ptr> CreateFuncToJITInvocationPass( llvm::ArrayRef tile_sizes, llvm::ArrayRef unroll_factors, - int64_t max_supported_rank, bool enable_ftz, bool index_64bit, - bool cpu_codegen, bool jit_i64_indexed_for_large_tensors) { + bool enable_ftz, bool index_64bit, bool cpu_codegen, + bool jit_i64_indexed_for_large_tensors) { return std::make_unique( - tile_sizes, unroll_factors, max_supported_rank, enable_ftz, index_64bit, - cpu_codegen, jit_i64_indexed_for_large_tensors); + tile_sizes, unroll_factors, enable_ftz, index_64bit, cpu_codegen, + jit_i64_indexed_for_large_tensors); } } // namespace transforms diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc 
b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc index b700189aec6608..37b799bb9666ea 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/gpu_kernel_to_blob_pass.cc @@ -112,12 +112,11 @@ class GpuKernelToBlobPass return tensorflow::errors::Internal( "Could not parse ROCm architecture prefix (expected gfx)"); } - std::string libdevice_dir = tensorflow::RocdlRoot(); auto llvm_module_copy = llvm::CloneModule(*llvmModule); auto hsaco_or = xla::gpu::amdgpu::CompileToHsaco( llvm_module_copy.get(), tensorflow::se::RocmComputeCapability{arch_str}, options, - libdevice_dir, options.DebugString()); + options.DebugString()); if (!hsaco_or.ok()) { return tensorflow::errors::Internal("Failure when generating HSACO"); } diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h index 355030d6009f18..45e248ceb904ff 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.h @@ -66,8 +66,8 @@ std::unique_ptr> CreateBufferReusePass(); // framework. std::unique_ptr> CreateFuncToJITInvocationPass( llvm::ArrayRef tile_sizes = {}, - llvm::ArrayRef unroll_factors = {}, int64_t max_supported_rank = 5, - bool enable_ftz = false, bool index_64bit = false, bool cpu_codegen = false, + llvm::ArrayRef unroll_factors = {}, bool enable_ftz = false, + bool index_64bit = false, bool cpu_codegen = false, bool jit_i64_indexed_for_large_tensors = false); // Pass for applying LLVM legalization patterns. 
diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td index 0aa80747ef5ebc..4f92be70d25397 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/passes.td @@ -52,8 +52,6 @@ def FuncToJITInvocationPass : Pass<"func-to-jit-invocation", "mlir::func::FuncOp "llvm::cl::ZeroOrMore">, ListOption<"unroll_factors_", "unroll-factors", "int64_t", "Unrolling in each tile dimension", "llvm::cl::ZeroOrMore">, - Option<"max_supported_rank_", "max-supported-rank", "int64_t", - /*default=*/"", "Max rank that this kernel supports">, Option<"enable_ftz_", "enable-ftz", "bool", /*default=*/"", "Enable the denormal flush to zero mode when generating code">, Option<"index_64bit_", "index_64bit", "bool", /*default=*/"", diff --git a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc index fe649e1edeb723..cffa5e7b44691e 100644 --- a/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc +++ b/tensorflow/compiler/mlir/tools/kernel_gen/transforms/tf_framework_legalize_to_llvm.cc @@ -271,8 +271,6 @@ class JITCompileFromStrOpConverter ConvertIntegerArrayAttrToStackAllocatedArray( loc, rewriter.getI64Type(), rewriter.getI64Type(), op.getUnrollFactors(), &rewriter); - Value max_supported_rank = rewriter.create( - loc, rewriter.getI64Type(), op.getMaxSupportedRankAttr()); Value enable_ftz = rewriter.create( loc, rewriter.getI1Type(), op.getEnableFtzAttr()); Value index_64bit = rewriter.create( @@ -285,8 +283,8 @@ class JITCompileFromStrOpConverter op, getVoidPtrType(), tf_func_ref, llvm::ArrayRef({adaptor.getCtx(), jit_module_code, tile_sizes.first, tile_sizes.second, unroll_factors.first, - unroll_factors.second, max_supported_rank, enable_ftz, - index_64bit, cpu_codegen})); + 
unroll_factors.second, enable_ftz, index_64bit, + cpu_codegen})); return success(); } @@ -304,7 +302,6 @@ class JITCompileFromStrOpConverter /*int64_t* tile_sizes_ptr*/ ptr_ty, /*int64_t num_unroll_factors*/ i64_ty, /*int64_t* unroll_factors_ptr*/ ptr_ty, - /*int64_t max_supported_rank*/ i64_ty, /*bool enable_ftz*/ i1_ty, /*bool index_64bit*/ i1_ty, /*bool cpu_codegen*/ i1_ty}); diff --git a/tensorflow/compiler/tests/xla_call_module_test.py b/tensorflow/compiler/tests/xla_call_module_test.py index 50f81b588c5cd2..76a76a4ab82747 100644 --- a/tensorflow/compiler/tests/xla_call_module_test.py +++ b/tensorflow/compiler/tests/xla_call_module_test.py @@ -1018,7 +1018,7 @@ def f(x, y): # x: f32[b, 4] y: f32[2, b, 4] %2 = stablehlo.reshape %arg0 : (tensor) -> tensor<1xi32> %3 = stablehlo.constant dense<4> : tensor<1xi32> %4 = "stablehlo.concatenate"(%0, %2, %3) {dimension = 0 : i64} : (tensor<1xi32>, tensor<1xi32>, tensor<1xi32>) -> tensor<3xi32> - %5 = "stablehlo.dynamic_broadcast_in_dim"(%arg1, %4) {broadcast_dimensions = dense<[1, 2]> : tensor<2xi64>} : (tensor, tensor<3xi32>) -> tensor<2x?x4xf32> + %5 = "stablehlo.dynamic_broadcast_in_dim"(%arg1, %4) {broadcast_dimensions = array} : (tensor, tensor<3xi32>) -> tensor<2x?x4xf32> %6 = stablehlo.add %5, %arg2 : (tensor<2x?x4xf32>, tensor<2x?x4xf32>) -> tensor<2x?x4xf32> return %5, %6 : tensor<2x?x4xf32>, tensor<2x?x4xf32> } diff --git a/tensorflow/compiler/tf2tensorrt/BUILD b/tensorflow/compiler/tf2tensorrt/BUILD index b91fb494667c5f..91ef722b52db86 100644 --- a/tensorflow/compiler/tf2tensorrt/BUILD +++ b/tensorflow/compiler/tf2tensorrt/BUILD @@ -298,6 +298,7 @@ cc_library( name = "trt_engine_resource_op_kernels", srcs = ["kernels/trt_engine_resource_ops.cc"], copts = tf_copts(), + features = ["-layering_check"], visibility = ["//tensorflow/core:__subpackages__"], deps = [ ":trt_allocator", @@ -994,6 +995,7 @@ tf_cuda_library( name = "trt_plugins", srcs = ["plugin/trt_plugin.cc"], hdrs = ["plugin/trt_plugin.h"], + features 
= ["-layering_check"], deps = [ "//tensorflow/core:framework_lite", "//tensorflow/core:lib_proto_parsing", diff --git a/tensorflow/compiler/tf2xla/BUILD b/tensorflow/compiler/tf2xla/BUILD index c565cef1489532..4f6706c11b3234 100644 --- a/tensorflow/compiler/tf2xla/BUILD +++ b/tensorflow/compiler/tf2xla/BUILD @@ -944,8 +944,8 @@ tf_cc_test( "@local_xla//xla/client:local_client", "@local_xla//xla/service:compiler", "@local_xla//xla/service:platform_util", - "@local_xla//xla/stream_executor:multi_platform_manager", "@local_xla//xla/stream_executor:platform", + "@local_xla//xla/stream_executor:platform_manager", ], ) @@ -1171,14 +1171,15 @@ cc_library( "//tensorflow/compiler/mlir:mlir_graph_optimization_pass", "//tensorflow/compiler/mlir/tensorflow", "//tensorflow/compiler/mlir/tensorflow:device_util", - "//tensorflow/compiler/mlir/tensorflow/transforms:bridge", "//tensorflow/compiler/mlir/tensorflow/transforms/host_runtime:lower_cluster_to_runtime_ops", "//tensorflow/compiler/mlir/tf2xla:mlir_bridge_rollout_policy", "//tensorflow/compiler/mlir/tf2xla/api/v1:cluster_tf", "//tensorflow/compiler/mlir/tf2xla/api/v1:tf_dialect_to_executor", "//tensorflow/compiler/mlir/tf2xla/api/v2:cluster_tf", "//tensorflow/compiler/mlir/tf2xla/api/v2:tf_dialect_to_executor", + "//tensorflow/compiler/mlir/tf2xla/internal:mlir_bridge_pass_util", "//tensorflow/core:core_cpu", + "//tensorflow/core:core_cpu_base", "//tensorflow/core:framework", "//tensorflow/core:lib", "//tensorflow/core/common_runtime:device_set", diff --git a/tensorflow/compiler/tf2xla/const_analysis.cc b/tensorflow/compiler/tf2xla/const_analysis.cc index b39d6f8f5d2bed..b8d91294ca7b18 100644 --- a/tensorflow/compiler/tf2xla/const_analysis.cc +++ b/tensorflow/compiler/tf2xla/const_analysis.cc @@ -42,7 +42,7 @@ Status GetFunctionBody(FunctionLibraryRuntime* flib_runtime, TF_RETURN_IF_ERROR(flib_runtime->Instantiate( name_attr_list.name(), AttrSlice(&name_attr_list.attr()), &func_handle)); *fbody = 
flib_runtime->GetFunctionBody(func_handle); - return OkStatus(); + return absl::OkStatus(); } Status GetFunctionBodies(FunctionLibraryRuntime* flib_runtime, @@ -57,7 +57,7 @@ Status GetFunctionBodies(FunctionLibraryRuntime* flib_runtime, &func_handle)); fbodies->push_back(flib_runtime->GetFunctionBody(func_handle)); } - return OkStatus(); + return absl::OkStatus(); } Status CondConstInputIndices( @@ -84,7 +84,7 @@ Status CondConstInputIndices( const_input_idxs->push_back(i + 1); } } - return OkStatus(); + return absl::OkStatus(); } Status GetCompileTimeConstInputs(const NodeDef& node, const OpKernel* op_kernel, @@ -133,7 +133,7 @@ Status GetCompileTimeConstInputs(const NodeDef& node, const OpKernel* op_kernel, } } } - return OkStatus(); + return absl::OkStatus(); } else if (node.op() == "If" || node.op() == "StatelessIf") { const FunctionBody* fthen = nullptr; const FunctionBody* felse = nullptr; @@ -162,7 +162,7 @@ Status GetCompileTimeConstInputs(const NodeDef& node, const OpKernel* op_kernel, const_input_idxs->push_back(i); } } - return OkStatus(); + return absl::OkStatus(); } else if (op_def != nullptr) { return XlaOpRegistry::CompileTimeConstantInputs(node, *op_def, const_input_idxs); @@ -193,7 +193,7 @@ Status BackwardsConstAnalysis( !edge_filter_input) { VLOG(5) << "Using cached argument indices on graph " << &g; *compile_time_const_arg_indices = g.GetConstArgIndicesCache().value(); - return OkStatus(); + return absl::OkStatus(); } auto edge_filter = [&](const Edge& e) { return edge_filter_input ? 
edge_filter_input(e) : true; diff --git a/tensorflow/compiler/tf2xla/functionalize_cond.cc b/tensorflow/compiler/tf2xla/functionalize_cond.cc index 1d3b6d3fe873b0..4c7048d205a324 100644 --- a/tensorflow/compiler/tf2xla/functionalize_cond.cc +++ b/tensorflow/compiler/tf2xla/functionalize_cond.cc @@ -128,14 +128,14 @@ Status GetSwitchPredicate(const Node& switch_node, OutputTensor* pred) { TF_RETURN_IF_ERROR(pred_edge->src()->input_edge(0, &pred_edge)); } *pred = OutputTensor(pred_edge->src(), pred_edge->src_output()); - return OkStatus(); + return absl::OkStatus(); } Status GetSwitchValue(const Node& switch_node, OutputTensor* val) { const Edge* val_edge; TF_RETURN_IF_ERROR(switch_node.input_edge(0, &val_edge)); *val = OutputTensor(val_edge->src(), val_edge->src_output()); - return OkStatus(); + return absl::OkStatus(); } bool StateMap::OutputTensorLess::operator()(const OutputTensor& lhs, @@ -394,7 +394,7 @@ Conditional::Conditional(OutputTensor predicate, FunctionalizeCond* parent, Status Conditional::AddMerge(Node* m) { merges_.insert(m); - return OkStatus(); + return absl::OkStatus(); } Status Conditional::AddSwitch(Node* s) { @@ -410,7 +410,7 @@ Status Conditional::AddSwitch(Node* s) { } switches_.insert(s); parent_->AddSwitchId(s->id()); - return OkStatus(); + return absl::OkStatus(); } Status Conditional::BuildArgumentNodes() { @@ -492,7 +492,7 @@ Status Conditional::BuildArgumentNodes() { } } - return OkStatus(); + return absl::OkStatus(); } Status Conditional::AddSwitchNodeAlongEdge(const Edge* edge, BranchType branch, @@ -741,7 +741,7 @@ Status Conditional::ExtractBodies(Graph* graph) { } } } - return OkStatus(); + return absl::OkStatus(); } Status Conditional::BuildIfNode(Graph* graph, @@ -834,7 +834,7 @@ Status Conditional::BuildIfNode(Graph* graph, TF_ASSIGN_OR_RETURN(if_node_, parent_->AddIfNode(if_def, *merges_.begin(), predicate_)); - return OkStatus(); + return absl::OkStatus(); } Status Conditional::AddInputEdges( @@ -871,7 +871,7 @@ Status 
Conditional::AddInputEdges( for (Node* n : external_control_inputs_) { graph->AddControlEdge(n, if_node_); } - return OkStatus(); + return absl::OkStatus(); } Status Conditional::AddOutputEdges( @@ -910,7 +910,7 @@ Status Conditional::AddOutputEdges( graph->AddControlEdge(if_node_, n); } - return OkStatus(); + return absl::OkStatus(); } Status Conditional::BuildAndReplace( @@ -918,7 +918,7 @@ Status Conditional::BuildAndReplace( std::unordered_map* merge_to_replacement) { VLOG(1) << "Build If and replace merge nodes " << NodesToString(this->merges_); - if (replaced_) return OkStatus(); + if (replaced_) return absl::OkStatus(); TF_RETURN_IF_ERROR(ExtractBodies(graph)); TF_RETURN_IF_ERROR(BuildArgumentNodes()); @@ -944,7 +944,7 @@ Status Conditional::BuildAndReplace( "Converting to If failed."); replaced_ = true; - return OkStatus(); + return absl::OkStatus(); } string Conditional::name() const { @@ -966,7 +966,7 @@ Status FunctionalizeCond::AddIdentityNode(const Node* replacee, Node* if_node, TF_RETURN_IF_ERROR(id_builder.Finalize(graph_, &id)); state_map_.ResetCondId(id, state_map_.LookupCondId(if_node)); state_map_.ResetAncestorId(id, state_map_.LookupAncestorId(if_node)); - return OkStatus(); + return absl::OkStatus(); } StatusOr FunctionalizeCond::AddIfNode(const NodeDef& def, @@ -1018,7 +1018,7 @@ Status FunctionalizeCond::PropagateUpdatedState(const Node* replacee) { changed.erase(n); } } - return OkStatus(); + return absl::OkStatus(); } // Returns the most restrictive branch of two branches or neither. This is the @@ -1160,7 +1160,7 @@ Status FunctionalizeCond::DetermineCondStateMerge(Node* dst) { // Only Merge nodes with two inputs are supported, but if this is a redundant // merge, then the dead edge may already have been removed (if due to a // switch) and so the input count would be incorrect. 
- if (state_map_.IsDead(state_map_.LookupCondId(dst))) return OkStatus(); + if (state_map_.IsDead(state_map_.LookupCondId(dst))) return absl::OkStatus(); int data_inputs = 0; for (auto e : dst->in_edges()) { @@ -1183,7 +1183,7 @@ Status FunctionalizeCond::DetermineCondStateMerge(Node* dst) { dst->name(), " only has ", data_inputs, " inputs, while only merge nodes with two inputs supported."); } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeCond::DetermineCondStateNonMerge(Node* dst) { @@ -1201,13 +1201,14 @@ Status FunctionalizeCond::DetermineCondStateNonMerge(Node* dst) { FormatNodeForError(*dst)); state_map_.ResetCondId(dst, id_or.value()); } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeCond::RemoveRedundantMerge(Node* node) { // Handle redundant merge nodes. A merge node is considered redundant if // one input edge is dead while the other has a value. - if (!state_map_.IsDead(state_map_.LookupCondId(node))) return OkStatus(); + if (!state_map_.IsDead(state_map_.LookupCondId(node))) + return absl::OkStatus(); const Edge* non_dead_edge = nullptr; for (auto e : node->in_edges()) { @@ -1239,7 +1240,7 @@ Status FunctionalizeCond::RemoveRedundantMerge(Node* node) { : non_dead_edge->src_output(), dst_node, dst_port); } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) { @@ -1251,7 +1252,7 @@ Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) { // (rather than boolean equivalence) and aimed at redundant switches as // currently generated by gradient code. 
StateMap::CondId dst_id = state_map_.LookupCondId(node); - if (state_map_.IsDead(dst_id)) return OkStatus(); + if (state_map_.IsDead(dst_id)) return absl::OkStatus(); BranchType b; OutputTensor pred; @@ -1272,7 +1273,7 @@ Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) { } b = state_map_.FindBranchOf(dst_id, val); if (b != BranchType::kThenBranch && b != BranchType::kElseBranch) - return OkStatus(); + return absl::OkStatus(); } VLOG(5) << "Redundant switch " << node->name() << " " << Branch_Name(b) << " " @@ -1309,7 +1310,7 @@ Status FunctionalizeCond::RemoveRedundantSwitch(Node* node) { switch_branch == Graph::kControlSlot ? Graph::kControlSlot : val_port, dst_node, dst_input); } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeCond::DetermineStates(std::vector rev_topo_order) { @@ -1325,7 +1326,7 @@ Status FunctionalizeCond::DetermineStates(std::vector rev_topo_order) { << " @ " << state_map_.AncestorStateToString(dst); if (VLOG_IS_ON(10)) DumpGraphWithCondState("it"); } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeCond::DetermineAncestorState(Node* dst) { @@ -1359,7 +1360,7 @@ Status FunctionalizeCond::DetermineAncestorState(Node* dst) { id = insert(id, src); } state_map_.ResetAncestorId(dst, id); - return OkStatus(); + return absl::OkStatus(); } void FunctionalizeCond::DeleteReachableAndDeadNodes( @@ -1504,7 +1505,7 @@ Status FunctionalizeCond::FunctionalizeInternal() { // No merges mean no switch values consumed (as only considering values // fetchable as output of merge); DeleteReachableAndDeadNodes(merge_order); - return OkStatus(); + return absl::OkStatus(); } TF_RETURN_IF_ERROR(DetermineStates(std::move(rev_topo_order))); @@ -1574,7 +1575,7 @@ Status FunctionalizeCond::FunctionalizeInternal() { DeleteReachableAndDeadNodes(merge_order); - return OkStatus(); + return absl::OkStatus(); } void FunctionalizeCond::DumpGraphWithCondState(const string& name) { diff --git 
a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc index 1d13170daa43e6..2bad3b58d34761 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow.cc @@ -117,7 +117,8 @@ Status AddFunctionDefToGraphLibrary( // `graph->flib_def().default_registry()` which is done in the following line // (we have to use `LookUp` instead of `Contains` or `Find` because the latter // both don't check the default registry). - if (graph->flib_def().LookUp(func_name, &op_reg_data).ok()) return OkStatus(); + if (graph->flib_def().LookUp(func_name, &op_reg_data).ok()) + return absl::OkStatus(); const FunctionDef* new_fdef = fld->Find(func_name); DCHECK(new_fdef != nullptr); @@ -197,7 +198,7 @@ Status FunctionalizeControlFlowForNodeAssociatedFunctions( } } } - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeControlFlowForFunction( @@ -210,7 +211,7 @@ Status FunctionalizeControlFlowForFunction( // Convert the function to a graph. 
FunctionLibraryRuntime::Handle handle; TF_RETURN_IF_ERROR(flr->Instantiate(func_name, AttrSlice(&attrs), &handle)); - Status ret_status = OkStatus(); + Status ret_status = absl::OkStatus(); auto cleanup_handle = gtl::MakeCleanup([&]() { auto s = flr->ReleaseHandle(handle); if (!s.ok()) { @@ -304,7 +305,7 @@ Status FunctionalizeControlFlow(Graph* graph, VLOG(2) << "FunctionalizeControlFlow (final): " << DumpGraphToFile("functionalize_final", *graph, library); - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, @@ -319,7 +320,7 @@ Status FunctionalizeControlFlowForGraphDef(GraphDef* graph_def, include_functions)); graph.ToGraphDef(graph_def); std::swap(*graph_def->mutable_library(), function_lib); - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeControlFlowForXlaPass::Run( @@ -388,7 +389,7 @@ Status FunctionalizeControlFlowForXlaPass::Run( DumpGraphToFile("functionalize_control_flow_after", *graph, options.flib_def); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc index a116f905097989..25a08224c8b946 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_test.cc @@ -56,7 +56,7 @@ Status FindIfThenAndElse(const GraphDef& graph, string* op_name, *then_fn = *result; TF_RETURN_IF_ERROR(GetNodeAttr(node, "else_branch", &result)); *else_fn = *result; - return OkStatus(); + return absl::OkStatus(); } } return errors::NotFound("No If node found in graph"); @@ -317,7 +317,7 @@ Status FindWhileCondAndBody(const GraphDef& graph, NameAttrList* cond, *cond = *result; TF_RETURN_IF_ERROR(GetNodeAttr(node, "body", &result)); *body = *result; - return OkStatus(); + return absl::OkStatus(); } } return errors::NotFound("No While node found in graph"); diff 
--git a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc index 71e3f9f69e4445..3260fa139cb673 100644 --- a/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc +++ b/tensorflow/compiler/tf2xla/functionalize_control_flow_util.cc @@ -78,7 +78,7 @@ Status ExtractWhileLoopFrames( } } - return OkStatus(); + return absl::OkStatus(); } // Check that the graph has no cycle containing the given node. @@ -99,7 +99,7 @@ Status CheckNodeNotInCycle(const Node* node, const int num_nodes) { } } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/functionalize_while.cc b/tensorflow/compiler/tf2xla/functionalize_while.cc index 0294c018e512db..7feb847f982c2e 100644 --- a/tensorflow/compiler/tf2xla/functionalize_while.cc +++ b/tensorflow/compiler/tf2xla/functionalize_while.cc @@ -103,7 +103,7 @@ Status CopySubgraph(const Graph& graph, const WhileLoopFrame* frame, output->AddEdge(src_copy, src_output, dst_copy, e->dst_input()); } } - return OkStatus(); + return absl::OkStatus(); } StatusOr BuildArgNode(Graph* graph, DataType type, int index) { @@ -206,7 +206,7 @@ Status BuildLoopBody(const Graph& graph, WhileLoopFrame* frame, TF_RETURN_IF_ERROR(CopySubgraph(graph, frame, std::move(next_iterations), squash_src_outputs, &node_map, output)); - return OkStatus(); + return absl::OkStatus(); } Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, @@ -216,7 +216,7 @@ Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, VLOG(2) << "Skipping functionalization for frame " << frame->name << " because it has control flow nodes that are filtered out by " "the specified node filter."; - return OkStatus(); + return absl::OkStatus(); } VLOG(2) << "Frame " << frame->name << " before: " << DumpGraphToFile("functionalize_before", *graph, library); @@ -501,7 +501,7 @@ Status FunctionalizeLoop(Graph* graph, WhileLoopFrame* frame, VLOG(2) 
<< "Frame " << frame->name << " after: " << DumpGraphToFile("functionalize_after", *graph, library); - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -565,7 +565,7 @@ Status FunctionalizeWhileLoop(Graph* graph, FunctionLibraryDefinition* library, } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc index 0179168be93bcd..70c09bc84ac275 100644 --- a/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc +++ b/tensorflow/compiler/tf2xla/fused_batchnorm_reserve_space_test.cc @@ -60,7 +60,7 @@ Status GetTestDevice(Session* session, string* test_device) { *test_device = found_gpu ? "GPU" : "CPU"; VLOG(2) << "Using test device " << *test_device; - return OkStatus(); + return absl::OkStatus(); } void FillZeros(Tensor* tensor) { diff --git a/tensorflow/compiler/tf2xla/graph_compiler.cc b/tensorflow/compiler/tf2xla/graph_compiler.cc index 4914efdba55a07..23eb33224dc24b 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler.cc @@ -114,7 +114,7 @@ Status PrepareArguments(XlaOpKernelContext* ctx, Graph* graph, return errors::InvalidArgument("Invalid function argument"); } } - return OkStatus(); + return absl::OkStatus(); } } // namespace Status GraphCompiler::Compile() { @@ -204,7 +204,7 @@ Status GraphCompiler::Compile() { } } } - return OkStatus(); + return absl::OkStatus(); } namespace { @@ -221,7 +221,7 @@ Status GetFunctionNameAndAttr(const FunctionLibraryRuntime& flib, " does not have 'func' field set"); } *func = attr_value->func(); - return OkStatus(); + return absl::OkStatus(); } if (flib.GetFunctionLibraryDefinition()->Find(node.def().op())) { @@ -230,7 +230,7 @@ Status GetFunctionNameAndAttr(const FunctionLibraryRuntime& flib, func->set_name(FunctionLibraryDefinition::kGradientOp); } *func->mutable_attr() = 
node.def().attr(); - return OkStatus(); + return absl::OkStatus(); } } // namespace diff --git a/tensorflow/compiler/tf2xla/graph_compiler_util.cc b/tensorflow/compiler/tf2xla/graph_compiler_util.cc index 68c576a52cba73..ac064805f1a470 100644 --- a/tensorflow/compiler/tf2xla/graph_compiler_util.cc +++ b/tensorflow/compiler/tf2xla/graph_compiler_util.cc @@ -106,7 +106,7 @@ Status AddArgNodes(Graph* graph, const NodeMap& node_map, graph->RemoveEdge(edge); } } - return OkStatus(); + return absl::OkStatus(); } // Each fetch id identifies the positional output of some node. For each fetch @@ -138,7 +138,7 @@ Status AddRetvalNodes(Graph* graph, const NodeMap& node_map, .Finalize(graph, &retval_node)); retval_nodes->insert(retval_node); } - return OkStatus(); + return absl::OkStatus(); } // RewriteAndPruneGraph identifies input and output edges (named by the feed and @@ -192,7 +192,7 @@ Status RewriteAndPruneGraph( ", missing feeds: ", absl::StrJoin(missing_feeds, ", "), ", missing fetches: ", absl::StrJoin(missing_fetches, ", ")); } - return OkStatus(); + return absl::OkStatus(); } // CollectArgNodes collects _Arg nodes from the graph, and performs basic @@ -224,7 +224,7 @@ Status CollectArgNodes(const Graph& graph, std::vector* arg_nodes) { } arg_nodes->push_back(index_node.second); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -243,7 +243,7 @@ Status CreateXlaArgs(const Graph& graph, TF_RETURN_IF_ERROR(GetNodeAttr(node->attrs(), kDebugNameAttr, &arg.name)); xla_args->push_back(arg); } - return OkStatus(); + return absl::OkStatus(); } void PopulateXlaArgs(const tf2xla::Config& config, @@ -306,7 +306,7 @@ Status InitGraph(const GraphDef& graph_def, const tf2xla::Config& config, TF_RETURN_IF_ERROR(g->AddFunctionLibrary(flib_def.ToProto())); *graph = std::move(g); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/host_compute_metadata.proto 
b/tensorflow/compiler/tf2xla/host_compute_metadata.proto index 43ab371a217e6c..9e6eec2cddc99e 100644 --- a/tensorflow/compiler/tf2xla/host_compute_metadata.proto +++ b/tensorflow/compiler/tf2xla/host_compute_metadata.proto @@ -1,19 +1,21 @@ syntax = "proto3"; package tensorflow.tf2xla; + +import "tensorflow/core/framework/tensor_shape.proto"; +import "tensorflow/core/framework/types.proto"; + option cc_enable_arenas = true; option java_outer_classname = "Tf2XlaProtos"; option java_multiple_files = true; option java_package = "org.tensorflow.tf2xla"; -import "tensorflow/core/framework/tensor_shape.proto"; -import "tensorflow/core/framework/types.proto"; - // TensorMetadata indicates the type and shape of a Tensor that is // part of a host compute transfer. message TensorMetadata { DataType type = 1; TensorShapeProto shape = 2; + int64 channel_id = 3; } // HostTransferMetadata describes a transfer either from host to device diff --git a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc index 80a43e2026d875..ba99f9b1297542 100644 --- a/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/all_reduce_op.cc @@ -96,7 +96,6 @@ class CollectiveReduceV2Op : public XlaOpKernel { void operator=(const CollectiveReduceV2Op&) = delete; }; - REGISTER_XLA_OP(Name("CollectiveReduceV2") .CompileTimeConstantInput("group_key") .CompileTimeConstantInput("group_size"), @@ -106,4 +105,13 @@ REGISTER_XLA_OP(Name("CollectiveAssignGroupV2") .CompileTimeConstantInput("group_assignment"), MlirXlaOpKernel); +REGISTER_XLA_OP(Name("XlaReduceScatter") + .CompileTimeConstantInput("group_assignment") + .CompileTimeConstantInput("scatter_dimension"), + MlirXlaOpKernel); + +REGISTER_XLA_OP( + Name("XlaAllReduce").CompileTimeConstantInput("group_assignment"), + MlirXlaOpKernel); + } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc 
b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc index c2bf7f7f606432..f44d800481fe6a 100644 --- a/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc +++ b/tensorflow/compiler/tf2xla/kernels/conv_op_helpers.cc @@ -148,7 +148,7 @@ Status CheckConvAttrs(const ConvOpAttrs& attrs) { attrs.dilations[input_dim]); } } - return OkStatus(); + return absl::OkStatus(); } // Wrapper around ConvBackpropComputeDimensions that converts from XLA shapes diff --git a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc index 2146284bd69864..92138c9663c556 100644 --- a/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/fused_conv_ops.cc @@ -277,7 +277,7 @@ class FusedConv2DInt8Op : public XlaOpKernel { } ctx->SetOutput(0, result); - return OkStatus(); + return absl::OkStatus(); } void Compile(XlaOpKernelContext* ctx) override { diff --git a/tensorflow/compiler/tf2xla/kernels/gather_op.cc b/tensorflow/compiler/tf2xla/kernels/gather_op.cc index 59d196c351c99b..5877aea0269643 100644 --- a/tensorflow/compiler/tf2xla/kernels/gather_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/gather_op.cc @@ -86,7 +86,7 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, *gather_output = xla::Broadcast(XlaHelpers::Zero(builder, dtype), out_shape.dim_sizes()); - return OkStatus(); + return absl::OkStatus(); } for (int64_t i = 0; i < num_index_dims; ++i) { @@ -152,7 +152,7 @@ Status XlaGather(const xla::XlaOp& input, const TensorShape& input_shape, } *gather_output = xla::Gather(input, indices, dim_numbers, slice_sizes); - return OkStatus(); + return absl::OkStatus(); } Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, @@ -236,7 +236,7 @@ Status XlaGatherWithBatchDimsOpImpl(XlaOpKernelContext* context, /*indices_are_nd=*/false, context->expected_output_dtype(0), index_type, context->builder(), gather_output)); } - return OkStatus(); + return absl::OkStatus(); 
} class GatherOp : public XlaOpKernel { public: diff --git a/tensorflow/compiler/tf2xla/kernels/if_op.cc b/tensorflow/compiler/tf2xla/kernels/if_op.cc index 945ae96b46c327..b2d0b2e1d418f2 100644 --- a/tensorflow/compiler/tf2xla/kernels/if_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/if_op.cc @@ -181,7 +181,7 @@ static Status ValidateShapes(XlaOpKernelContext* ctx, "Mismatch in resource of then and else branch for resource ", i); } } - return OkStatus(); + return absl::OkStatus(); } // TODO(b/35949885): There is duplication here with the handling of the diff --git a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc index fbdcecdcf95dfd..60ef289567fe8f 100644 --- a/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc +++ b/tensorflow/compiler/tf2xla/kernels/light_outside_compilation.cc @@ -53,8 +53,8 @@ limitations under the License. #include "xla/shape.h" #include "xla/shape_util.h" #include "xla/status_macros.h" -#include "xla/stream_executor/multi_platform_manager.h" #include "xla/stream_executor/platform.h" +#include "xla/stream_executor/platform_manager.h" #include "xla/stream_executor/stream.h" #include "xla/util.h" #include "tensorflow/core/common_runtime/gpu/gpu_process_state.h" @@ -374,7 +374,7 @@ Status LightOutsideCompilationOp::CompileToCustomCallCallingTfKernel( output_shape.IsTuple() ? xla::GetTupleElement(out, i) : out); } - return OkStatus(); + return absl::OkStatus(); } namespace { @@ -552,7 +552,7 @@ Status PopulateMetadataBufferIfNeeded(OpKernelContext& ctx, num_dimensions * sizeof(int32_t)); } } - return OkStatus(); + return absl::OkStatus(); } class FakeDeviceContext : public DeviceContext { @@ -569,8 +569,7 @@ Status CallTfKernel(void* stream_handle, void** buffers, const char* opaque, // Look up the platform only once, for a small performance gain. 
static Status* platform_status = nullptr; static se::Platform* platform = [&]() -> se::Platform* { - StatusOr p = - se::MultiPlatformManager::PlatformWithName("CUDA"); + StatusOr p = se::PlatformManager::PlatformWithName("CUDA"); if (!p.ok()) { platform_status = new Status(p.status()); return nullptr; @@ -708,7 +707,7 @@ Status CallTfKernel(void* stream_handle, void** buffers, const char* opaque, } TF_RETURN_IF_ERROR(ctx.status()); - return OkStatus(); + return absl::OkStatus(); } void GenericTfCallback(void* stream_handle, void** buffers, const char* opaque, diff --git a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc index 9a86b26fb8a623..258e0b0d47f6a0 100644 --- a/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/listdiff_op.cc @@ -98,7 +98,7 @@ class ListDiffOp : public XlaOpKernel { xla::ConstantR1(context->builder(), val_output)); context->SetOutput(1, xla::ConstantR1(context->builder(), idx_output)); - return OkStatus(); + return absl::OkStatus(); } template diff --git a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc index fdd38e2f6beb32..a82cd2e3b85db1 100644 --- a/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/pooling_ops.cc @@ -55,7 +55,7 @@ static Status ValidateKernelSizes(const T& ksizes) { " must be positive but is ", ksizes[i]); } } - return OkStatus(); + return absl::OkStatus(); } template @@ -67,7 +67,7 @@ static Status ValidateStrides(const T& strides) { " must be positive but is ", strides[i]); } } - return OkStatus(); + return absl::OkStatus(); } // Superclass of pooling ops. 
diff --git a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc index ce20763c6146eb..81282578bb2ee6 100644 --- a/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/reduction_ops.cc @@ -101,7 +101,7 @@ class MaxOp : public XlaReductionOp { "Unsupported PrimitiveType in MaxOp: '", xla::PrimitiveType_Name(xla_reduction_type), "'"); } else { - return OkStatus(); + return absl::OkStatus(); } } diff --git a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc index 77062741c0f91f..55eaf3db8a7570 100644 --- a/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/scatter_nd_op.cc @@ -58,7 +58,7 @@ Status ValidateUpdateShape(const TensorShape& buffer_shape, }; if (updates_shape.dims() == 0 && broadcast_scalar_update) { - return OkStatus(); + return absl::OkStatus(); } if (updates_shape.dims() < batch_dim) return shape_err(); @@ -81,7 +81,7 @@ Status ValidateUpdateShape(const TensorShape& buffer_shape, return shape_err(); } } - return OkStatus(); + return absl::OkStatus(); } class ScatterNdOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/shape_util.cc b/tensorflow/compiler/tf2xla/kernels/shape_util.cc index 06f7160e392456..1cd296b349a9dc 100644 --- a/tensorflow/compiler/tf2xla/kernels/shape_util.cc +++ b/tensorflow/compiler/tf2xla/kernels/shape_util.cc @@ -42,7 +42,7 @@ Status TensorShapeToConstant(const TensorShape& input_shape, vec(i) = dim_size; } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc index c175a5584e8f2d..3c1ca648769d95 100644 --- a/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/sharding_util_ops.cc @@ -91,7 +91,7 @@ Status 
GetAndValidateAttributes(OpKernelConstruction* ctx, paddings.assign(expected_rank, 0); } - return OkStatus(); + return absl::OkStatus(); } std::vector GetSliceIndices(absl::Span num_partitions, @@ -174,10 +174,10 @@ class XlaSplitNDBaseOp : public XlaOpKernel { xla::Pad(input, xla::ConstantR0WithType(ctx->builder(), type, /*value=*/0), padding_config)); - return OkStatus(); + return absl::OkStatus(); } else if (num_slices_ == 1) { ctx->SetOutput(/*index=*/0, input); - return OkStatus(); + return absl::OkStatus(); } // Slice shape with optional padding. @@ -242,7 +242,7 @@ class XlaSplitNDBaseOp : public XlaOpKernel { slice_limit_indices, slice_strides)); } } - return OkStatus(); + return absl::OkStatus(); } private: @@ -426,7 +426,7 @@ class XlaConcatNDBaseOp : public XlaOpKernel { output_shape.push_back(max_dim_size - paddings_[dim]); } - return OkStatus(); + return absl::OkStatus(); } std::vector num_concats_; diff --git a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc index e74381aa6f24b4..8131769503086c 100644 --- a/tensorflow/compiler/tf2xla/kernels/stack_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stack_ops.cc @@ -85,7 +85,7 @@ Status MaybeInitializeStack(xla::XlaBuilder* builder, XlaResource* resource, actual_shape.DebugString()); } } - return OkStatus(); + return absl::OkStatus(); } class StackOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc index 6fbb413b46bc3b..d5746a5bfd729b 100644 --- a/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/stateful_random_ops.cc @@ -142,7 +142,7 @@ Status CheckStateShape(xla::RandomAlgorithm alg, const TensorShape& shape) { return errors::InvalidArgument("The size of the state must be at least ", min_state_size, "; got ", state_size); } - return OkStatus(); + return absl::OkStatus(); } StatusOr ResolveAlg(int 
alg_id) { @@ -227,7 +227,7 @@ Status CompileImpl( var = BitcastConvertType(var, state_element_type); TF_RETURN_IF_ERROR( ctx->AssignVariable(state_input_idx, STATE_ELEMENT_DTYPE, var)); - return OkStatus(); + return absl::OkStatus(); } class StatefulUniformOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc index 22614039459a90..aca9973e118c14 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_array_ops.cc @@ -84,7 +84,7 @@ Status MaybeInitializeTensorArray(xla::XlaBuilder* builder, shape.DebugString()); } } - return OkStatus(); + return absl::OkStatus(); } // Checks that the TensorArray 'resource' has been initialized, and has type @@ -106,14 +106,14 @@ Status CheckTensorArrayIsInitialized(const string& op_name, " but op has dtype ", DataTypeString(dtype), "."); } - return OkStatus(); + return absl::OkStatus(); } Status GetTensorArrayShape(const XlaResource* resource, xla::XlaBuilder* builder, TensorShape* shape) { *shape = resource->shape(); shape->InsertDim(0, resource->max_array_size()); - return OkStatus(); + return absl::OkStatus(); } // Like XlaBuilder::DynamicUpdateSlice, but adds 'update' to the diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc index 7e4c16ca189de3..8f8e6bd90ca448 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_ops.cc @@ -119,19 +119,19 @@ Status TryGetElementShapeFromInput(XlaOpKernelContext* ctx, xla::XlaOp input, bool is_compile_time_constant = is_compile_time_constant_or.value(); if (!is_compile_time_constant) { *got_shape = false; - return OkStatus(); + return absl::OkStatus(); } PartialTensorShape partial_shape; TF_RETURN_IF_ERROR(ctx->ConstantInputAsPartialShape(0, &partial_shape)); if (!partial_shape.IsFullyDefined()) { 
*got_shape = false; - return OkStatus(); + return absl::OkStatus(); } *shape = xla::ShapeUtil::MakeShape(dtype, partial_shape.dim_sizes()); *got_shape = true; - return OkStatus(); + return absl::OkStatus(); } class TensorListReserveOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc index 575a3f400d899b..d7a1b5f970561a 100644 --- a/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc +++ b/tensorflow/compiler/tf2xla/kernels/tensor_list_utils.cc @@ -117,7 +117,7 @@ bool IsTensorListInput(XlaOpKernelContext* ctx, int index) { Status IsTensorListInitialized(xla::XlaOp list, bool* is_initialized) { TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list)); *is_initialized = list_shape.IsTuple(); - return OkStatus(); + return absl::OkStatus(); } Status IsNestedTensorList(xla::XlaOp list, bool* is_nested_list) { @@ -128,14 +128,14 @@ Status IsNestedTensorList(xla::XlaOp list, bool* is_nested_list) { } TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list)); *is_nested_list = (xla::ShapeUtil::TupleElementCount(list_shape) > 2); - return OkStatus(); + return absl::OkStatus(); } Status BuildNonNestedTensorList(xla::XlaOp buffer, xla::XlaOp push_index, xla::XlaOp* output_list) { TF_RET_CHECK(buffer.builder()); *output_list = xla::Tuple(buffer.builder(), {buffer, push_index}); - return OkStatus(); + return absl::OkStatus(); } Status GetTensorListBufferShape(xla::XlaOp list, xla::Shape* buffer_shape) { @@ -146,7 +146,7 @@ Status GetTensorListBufferShape(xla::XlaOp list, xla::Shape* buffer_shape) { } TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list)); *buffer_shape = xla::ShapeUtil::GetTupleElementShape(list_shape, 0); - return OkStatus(); + return absl::OkStatus(); } Status GetTensorListBuffer(xla::XlaOp list, xla::XlaOp* buffer) { @@ -156,7 +156,7 @@ Status GetTensorListBuffer(xla::XlaOp list, xla::XlaOp* buffer) { 
return errors::InvalidArgument("TensorList is not initialized"); } *buffer = xla::GetTupleElement(list, 0); - return OkStatus(); + return absl::OkStatus(); } Status GetTensorListPushIndex(xla::XlaOp list, xla::XlaOp* push_index) { @@ -168,7 +168,7 @@ Status GetTensorListPushIndex(xla::XlaOp list, xla::XlaOp* push_index) { TF_ASSIGN_OR_RETURN(xla::Shape list_shape, list.builder()->GetShape(list)); int tuple_size = xla::ShapeUtil::TupleElementCount(list_shape); *push_index = xla::GetTupleElement(list, tuple_size - 1); - return OkStatus(); + return absl::OkStatus(); } Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index, @@ -187,7 +187,7 @@ Status SetTensorListPushIndex(xla::XlaOp list, xla::XlaOp push_index, } result_parts.push_back(push_index); *result = xla::Tuple(list.builder(), result_parts); - return OkStatus(); + return absl::OkStatus(); } xla::XlaOp BuildUninitializedTensorList(xla::XlaBuilder* b, @@ -222,7 +222,7 @@ Status GetLeadingDimForTensorList(xla::XlaOp list, int64_t* leading_dim, *leading_dim = list_shape.dimensions(0); *leading_dim_dynamic_size = xla::GetDimensionSize(list, 0); } - return OkStatus(); + return absl::OkStatus(); } Status GetTensorListShapeFromElementTensorListShape( @@ -244,7 +244,7 @@ Status GetTensorListShapeFromElementTensorListShape( shapes.push_back(xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector{})); *tensor_list_shape = xla::ShapeUtil::MakeTupleShape(shapes); - return OkStatus(); + return absl::OkStatus(); } Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, @@ -267,7 +267,7 @@ Status GetTensorListShapeFromElementShape(const xla::Shape& element_shape, shapes.push_back(xla::ShapeUtil::MakeShape(xla::PrimitiveType::S32, std::vector{})); *tensor_list_shape = xla::ShapeUtil::MakeTupleShape(shapes); - return OkStatus(); + return absl::OkStatus(); } Status CreateZerosTensorListWithShape( @@ -296,7 +296,7 @@ Status CreateZerosTensorListWithShape( .element_type() == xla::S32); 
elements.push_back(xla::ConstantLiteral(b, xla::LiteralUtil::Zero(xla::S32))); *list = xla::Tuple(b, elements); - return OkStatus(); + return absl::OkStatus(); } Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, @@ -330,7 +330,7 @@ Status GetInitializedTensorListForElement(xla::XlaOp list, xla::XlaOp element, ", expected: ", list_shape.DebugString()); } *initialized_list = list; - return OkStatus(); + return absl::OkStatus(); } else { // Prepare dynamic dimension dimensions for zero tensor list. The dynamic // sizes are created by reading the dynamic dimension size of sub-elements. @@ -414,7 +414,7 @@ Status ExecuteTensorListPushBack(xla::XlaOp list, xla::XlaOp element, result_parts.push_back(updated_push_index); *result = xla::Tuple(b, result_parts); - return OkStatus(); + return absl::OkStatus(); } Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, @@ -463,7 +463,7 @@ Status ExecuteTensorListPopBack(xla::XlaOp list, xla::XlaOp* list_result, *element_result = element_result_parts[0]; } - return OkStatus(); + return absl::OkStatus(); } Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, @@ -499,7 +499,7 @@ Status ExecuteTensorListSetItem(xla::XlaOp list, xla::XlaOp index, result_parts.push_back(updated_list_part); result_parts.push_back(xla::GetTupleElement(list, 1)); *result = xla::Tuple(b, result_parts); - return OkStatus(); + return absl::OkStatus(); } Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, @@ -541,7 +541,7 @@ Status ExecuteTensorListGetItem(xla::XlaOp list, xla::XlaOp index, } slice_shape.erase(slice_shape.begin()); *result = xla::Reshape(read, slice_shape); - return OkStatus(); + return absl::OkStatus(); } Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor, @@ -558,7 +558,7 @@ Status ExecuteTensorListFromTensor(int push_index, xla::XlaOp tensor, std::vector result_parts{tensor, xla::ConstantR0(b, push_index)}; *result = xla::Tuple(b, result_parts); 
- return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc b/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc index 015a0ce40e80d3..e7ba8d13082849 100644 --- a/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/to_bool_op.cc @@ -42,7 +42,7 @@ class ToBoolOp : public XlaOpKernel { if (shape.rank() == 0) { auto result = xla::Ne(ctx->Input(0), xla::ZerosLike(input)); ctx->SetOutput(0, result); - return OkStatus(); + return absl::OkStatus(); } // Otherwise, any input tensor with elements returns True. Input tensor @@ -54,7 +54,7 @@ class ToBoolOp : public XlaOpKernel { auto result = xla::Ne(num_elements, xla::ZerosLike(num_elements)); ctx->SetOutput(0, result); - return OkStatus(); + return absl::OkStatus(); } }; diff --git a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc index 7cdf30594581c6..27cfc014bacb04 100644 --- a/tensorflow/compiler/tf2xla/kernels/variable_ops.cc +++ b/tensorflow/compiler/tf2xla/kernels/variable_ops.cc @@ -41,7 +41,7 @@ Status ValidateAssignUpdateVariableOpShapes(XlaOpKernelContext* ctx) { ctx->GetVariableTypeAndShape(0, &variable_dtype, &variable_shape)); TF_RETURN_IF_ERROR( ValidateAssignUpdateVariableOpShapes(variable_shape, value_shape)); - return OkStatus(); + return absl::OkStatus(); } class VarIsInitializedOp : public XlaOpKernel { diff --git a/tensorflow/compiler/tf2xla/kernels/while_op.cc b/tensorflow/compiler/tf2xla/kernels/while_op.cc index c3370dcde64b70..d6685cc1e1d965 100644 --- a/tensorflow/compiler/tf2xla/kernels/while_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/while_op.cc @@ -77,7 +77,7 @@ Status VerifyResourceArgsGroupedAtEnd(XlaOpKernelContext* ctx, } } } - return OkStatus(); + return absl::OkStatus(); } // Builds XlaCompiler argument descriptions `args` from `ctx`. 
@@ -128,7 +128,7 @@ Status MakeXlaCompilerArgumentsFromInputs( } } } - return OkStatus(); + return absl::OkStatus(); } // Populates loop invariant indices to true in `loop_invariants`. @@ -186,7 +186,7 @@ Status ConvertLoopInvariantsToConst( compile_time_const_arg_indices->at(arg_idx) = true; (*num_compile_time_const_args)++; } - return OkStatus(); + return absl::OkStatus(); } Status VerifyBodyInputAndOutputShapeMatch( @@ -213,7 +213,7 @@ Status VerifyBodyInputAndOutputShapeMatch( xla::ShapeUtil::HumanString(body_input_shape), " vs. ", xla::ShapeUtil::HumanString(body_output_shape)); } - return OkStatus(); + return absl::OkStatus(); } StatusOr BuildWrappedCond( diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc index 589b9daec8772e..6a31628501cece 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.cc @@ -124,7 +124,7 @@ tsl::StatusOr> XlaCallModuleLoader::Create( return loader; } -tsl::Status XlaCallModuleLoader::SetPlatformIndex( +absl::Status XlaCallModuleLoader::SetPlatformIndex( absl::string_view compilation_platform) { int platform_index = -1; if (!platforms_.empty()) { @@ -186,7 +186,7 @@ tsl::Status XlaCallModuleLoader::SetPlatformIndex( return tsl::OkStatus(); } -tsl::Status XlaCallModuleLoader::RefineDynamicShapes( +absl::Status XlaCallModuleLoader::RefineDynamicShapes( llvm::ArrayRef input_shapes) { // Skip shape refinement for new versions if USES_SHAPE_POLYMORPHISM_ATTR=1 if (version_ >= kVersionStartSupportUsesShapePolymorphismAttr) { @@ -264,7 +264,7 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( ConvertPrimitiveTypeToMLIRType(xla_shape.element_type(), builder)); mlir::RankedTensorType type = mlir::RankedTensorType::get(xla_dimensions, element_type); - // TODO(burmako): This fails with an obscure compilation error. 
+ // TODO(burmako): This fails with an obscure compilation error on Windows. // TF_ASSIGN_OR_RETURN( // mlir::Type type, // ConvertShapeToType(xla_shape, builder)); @@ -349,7 +349,7 @@ tsl::Status XlaCallModuleLoader::RefineDynamicShapes( return tsl::OkStatus(); } -tsl::Status XlaCallModuleLoader::LoadModule( +absl::Status XlaCallModuleLoader::LoadModule( mlir::MLIRContext *context, int version, std::string module_str, std::vector disabled_checks, std::vector platforms, int num_invocation_args, @@ -446,7 +446,7 @@ tsl::Status XlaCallModuleLoader::LoadModule( return tsl::OkStatus(); } -tsl::Status XlaCallModuleLoader::ValidateDialect() { +absl::Status XlaCallModuleLoader::ValidateDialect() { mlir::StatusScopedDiagnosticHandler diag_handler(module_->getContext()); bool moduleHasUnsupportedDialects = false; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h index e77ce0effcf92c..3e9627ebcc29a2 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h +++ b/tensorflow/compiler/tf2xla/kernels/xla_call_module_loader.h @@ -52,7 +52,7 @@ class XlaCallModuleLoader { // Sets the platform index argument, if the module is compiled for multiple // platforms, and then erases the argument. - tsl::Status SetPlatformIndex(absl::string_view compilation_platform); + absl::Status SetPlatformIndex(absl::string_view compilation_platform); // Refines the dynamic module arguments based on the static argument shapes. // This assumes that the module has a "main" function without dimension args, @@ -71,10 +71,10 @@ class XlaCallModuleLoader { // cause lifetime issues. // The input_shapes includes only the non-token and the non-platform-index // arguments. - tsl::Status RefineDynamicShapes(llvm::ArrayRef input_shapes); + absl::Status RefineDynamicShapes(llvm::ArrayRef input_shapes); // Validates that the module only contains ops from valid dialects. 
- tsl::Status ValidateDialect(); + absl::Status ValidateDialect(); // Validates that the module represents a statically-shaped StableHLO program, // otherwise all sorts of weirdness might happen in the HLO exporter which is @@ -97,16 +97,16 @@ class XlaCallModuleLoader { XlaCallModuleLoader() = default; // Initializes the loader with the given serialized module string. - tsl::Status LoadModule(mlir::MLIRContext* context, int version, - std::string module_str, - std::vector disabled_checks, - std::vector platforms, - int num_invocation_args, - bool main_has_token_input_output); + absl::Status LoadModule(mlir::MLIRContext* context, int version, + std::string module_str, + std::vector disabled_checks, + std::vector platforms, + int num_invocation_args, + bool main_has_token_input_output); // Adds a wrapper for the "main" function to compute the platform index and // the dimension arguments. - tsl::Status AddMainWrapper(); + absl::Status AddMainWrapper(); mlir::MLIRContext* context_; int version_; diff --git a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_v2_op.cc b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_v2_op.cc index cb36059f62051a..2f1883a289cd3c 100644 --- a/tensorflow/compiler/tf2xla/kernels/xla_custom_call_v2_op.cc +++ b/tensorflow/compiler/tf2xla/kernels/xla_custom_call_v2_op.cc @@ -93,7 +93,7 @@ class XlaCustomCallV2Op : public XlaOpKernel { } } - return OkStatus(); + return absl::OkStatus(); } std::string call_target_name_; diff --git a/tensorflow/compiler/tf2xla/layout_util.cc b/tensorflow/compiler/tf2xla/layout_util.cc index eb4526a7ea3cc4..fb53e079b97857 100644 --- a/tensorflow/compiler/tf2xla/layout_util.cc +++ b/tensorflow/compiler/tf2xla/layout_util.cc @@ -74,7 +74,7 @@ Status RewriteLayoutWithShardedShape( layout_preference)); *xla_shape->mutable_layout() = per_device_xla_shape.layout(); } - return OkStatus(); + return absl::OkStatus(); } // There is a shape_representation_fn or sharding for an output, this function diff --git 
a/tensorflow/compiler/tf2xla/lib/broadcast.cc b/tensorflow/compiler/tf2xla/lib/broadcast.cc index ef70161fdcdcb2..da1b4182004e8a 100644 --- a/tensorflow/compiler/tf2xla/lib/broadcast.cc +++ b/tensorflow/compiler/tf2xla/lib/broadcast.cc @@ -51,7 +51,7 @@ Status BroadcastOpsToSame(xla::XlaOp* lhs, xla::XlaOp* rhs) { TF_ASSIGN_OR_RETURN(*lhs, xla::BroadcastTo(*lhs, bcast.output_shape())); TF_ASSIGN_OR_RETURN(*rhs, xla::BroadcastTo(*rhs, bcast.output_shape())); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc index 1ea2e98903d69e..c57fb68edf8e24 100644 --- a/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc +++ b/tensorflow/compiler/tf2xla/mlir_bridge_pass.cc @@ -15,6 +15,7 @@ limitations under the License. #include "tensorflow/compiler/tf2xla/mlir_bridge_pass.h" +#include #include #include "tensorflow/compiler/mlir/tf2xla/mlir_bridge_rollout_policy.h" @@ -33,6 +34,7 @@ limitations under the License. 
#include "tensorflow/compiler/mlir/tf2xla/api/v1/tf_dialect_to_executor.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/cluster_tf.h" #include "tensorflow/compiler/mlir/tf2xla/api/v2/tf_dialect_to_executor.h" +#include "tensorflow/compiler/mlir/tf2xla/internal/mlir_bridge_pass_util.h" #include "tensorflow/compiler/tf2xla/tf2xla_defs.h" #include "tensorflow/compiler/tf2xla/xla_op_registry.h" #include "tensorflow/core/common_runtime/device_set.h" @@ -58,14 +60,6 @@ auto* mlir_bridge_gauge_v2 = monitoring::Gauge::New( "/tensorflow/config/experimental/enable_mlir_bridge_gauge_v2", "Tracks usage of the MLIR-based TF2XLA bridge among TF2 models"); -auto* replicated_graphs_without_device_type_counter = - tensorflow::monitoring::Counter<1>::New( - /* metric name */ - "/tensorflow/core/tf2xla/replicated_graphs_without_device_type_count", - /* metric description */ - "Tracks if any replicated graphs are without device type", - /* metric field */ "version"); - namespace { using ::mlir::ModuleOp; @@ -79,22 +73,6 @@ bool HasTPUDevice(const DeviceSet& device_set) { return false; } -// Check that graph has tf.StatefulPartitionedCall op with _XlaMustCompile. 
-bool RunNonReplicatedBridge(const Graph& graph) { - const std::string kStatefulPartitionedCallOp = "StatefulPartitionedCall"; - const std::string kXlaMustCompile = "_XlaMustCompile"; - for (const Node* node : graph.nodes()) { - auto node_op = node->type_string(); - if (node_op == kStatefulPartitionedCallOp) { - auto attr = node->attrs().FindByString(kXlaMustCompile); - if (attr != nullptr && attr->b() == true) { - return true; - } - } - } - return false; -} - bool HasTPUDevice(mlir::ModuleOp module) { mlir::TF::RuntimeDevices devices; if (failed(GetDevicesFromOp(module.getOperation(), &devices))) return false; @@ -105,49 +83,10 @@ bool HasTPUDevice(mlir::ModuleOp module) { }); } -bool IsReplicatedGraph(mlir::ModuleOp module) { - auto walk_result = module.walk([&](mlir::Operation* op) { - // TODO(b/223677572): Once the scope for new compilation and replication - // markers is expanded beyond bridge we can remove this check for - // `kTPUReplicateAttr`, we will then always have a `kCompileDeviceTypeAttr` - // in such cases (see above). - // TODO(b/229028654): Remove string conversion once we have C++17. - const llvm::StringRef tpu_replicate_attr_name(kTpuReplicateAttr.data(), - kTpuReplicateAttr.size()); - auto replicate_attr = - op->getAttrOfType(tpu_replicate_attr_name); - if (replicate_attr) return mlir::WalkResult::interrupt(); - return mlir::WalkResult::advance(); - }); - return walk_result.wasInterrupted(); -} - -bool IsReplicatedGraphWithoutDeviceType(mlir::ModuleOp module) { - return !HasTPUDevice(module) && IsReplicatedGraph(module); -} - -bool IsSingleCoreTPUGraph(mlir::ModuleOp module) { - auto walk_result = module.walk([&](mlir::Operation* op) { - // Check for ops with compile device type "TPU". This allows us to support - // TPU compilation without replication. Note that currently the compile - // device type is not set by default before bridge, only if eager context - // attribute `jit_compile_rewrite` is true. 
- // TODO(b/229028654): Remove string conversion once we have C++17. - const llvm::StringRef compile_device_type_attr_name( - kCompileDeviceTypeAttr.data(), kCompileDeviceTypeAttr.size()); - auto compilation_attr = - op->getAttrOfType(compile_device_type_attr_name); - if (compilation_attr && compilation_attr.getValue().str() == kTpuDevice) { - return mlir::WalkResult::interrupt(); - } - return mlir::WalkResult::advance(); - }); - return walk_result.wasInterrupted(); -} - -bool RunReplicatedBridge(mlir::ModuleOp module) { - if (HasTPUDevice(module) && IsReplicatedGraph(module)) return true; - return IsSingleCoreTPUGraph(module); +bool HasDevice(mlir::ModuleOp module) { + mlir::TF::RuntimeDevices devices; + if (failed(GetDevicesFromOp(module.getOperation(), &devices))) return false; + return !devices.device_names().empty(); } bool HasTPUPartitionedCallOpInModule(mlir::ModuleOp module) { @@ -201,10 +140,12 @@ absl::Status RunLowerToRuntimeOpsOnSubmodule(ModuleOp parent_module, // The config_proto param is a required input for all TF1 graphs but it is // redundant for TF2 graphs. MlirOptimizationPassState GetPassStateImpl( - bool run_replicated_bridge, const ConfigProto& config_proto, + bool is_supported_by_replicated_brige, const ConfigProto& config_proto, const Graph& graph, const FunctionLibraryDefinition& function_library) { // Skip MLIR TF/XLA Bridge if no XLA-compilable ops are found. - if (!run_replicated_bridge && !RunNonReplicatedBridge(graph)) { + // TODO(b/324474356): also check the called function in the library. + if (!is_supported_by_replicated_brige && + !IsSupportedByNonReplicatedBridge(graph, /*function_library*/ nullptr)) { VLOG(3) << "Skipping MLIR Bridge, graph is not qualified to run the bridge"; return MlirOptimizationPassState::Disabled; } @@ -214,58 +155,43 @@ MlirOptimizationPassState GetPassStateImpl( // GetMlirBridgeRolloutPolicy will analyze a TPU graph if users have not // explicltly requested a policy. 
MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( - graph, &function_library, config_proto, - /*run_replicated_bridge*/ run_replicated_bridge, + graph, &function_library, config_proto, is_supported_by_replicated_brige, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/false, /*record_stats=*/false); // GetPassState is called once before MlirBridgePass starts, and the pass // gets skipped if it is disabled. Log such cases in this function. The cases // where the pass is enabled will only be logged during their execution to // prevent them from being counted twice. - if (run_replicated_bridge) { - switch (policy) { - case MlirBridgeRolloutPolicy::kEnabledByUser: - return MlirOptimizationPassState::Enabled; - case MlirBridgeRolloutPolicy::kEnabledAfterGraphAnalysis: - return MlirOptimizationPassState::FallbackEnabled; - case MlirBridgeRolloutPolicy::kDisabledByUser: - VLOG(1) << "Skipping MLIR TPU Bridge, disabled by user. " - "Old bridge will evaluate."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, - "disabled_by_user"); - return MlirOptimizationPassState::Disabled; - case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis: - VLOG(1) << "Skipping MLIR TPU Bridge, disabled because " - "graph has unsupported features. Old bridge will evaluate."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, - "invalid_graph"); - // We set `uses_uninitialized_resource_args` to false here because the - // first phase of the bridge is not affected by uninitialized resource - // args. - // For Invalid Graph Analysis we need to log here because Run will not - // be called. 
- LogGraphFeatures(graph, &function_library, config_proto, - /*uses_uninitialized_resource_args=*/false, - /*is_v1_compat=*/false); - return MlirOptimizationPassState::Disabled; - } - } - // TODO(b/277112519): Have uniform behavior for GPU/CPU and TPU switch (policy) { case MlirBridgeRolloutPolicy::kEnabledByUser: return MlirOptimizationPassState::Enabled; case MlirBridgeRolloutPolicy::kEnabledAfterGraphAnalysis: return MlirOptimizationPassState::FallbackEnabled; - case MlirBridgeRolloutPolicy::kDisabledByUser: - VLOG(1) << "Skipping MLIR CPU/GPU Bridge, disabled by user."; - metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "v2", false, - "disabled_by_user"); + case MlirBridgeRolloutPolicy::kDisabledByUser: { + VLOG(1) << "Skipping MLIR " + << (is_supported_by_replicated_brige ? "Replicated" + : "Non-Replicated") + << " Bridge, disabled by user. " + "The fallback will evaluate."; + metrics::UpdateTfMlirBridgeFirstPhaseCounter( + is_supported_by_replicated_brige ? "tpu" : "cpu/gpu", "v2", true, + "disabled_by_user"); return MlirOptimizationPassState::Disabled; - default: - // This case should never be hit. Added here to be consistent with OSS - // implementation. - metrics::UpdateTfMlirBridgeFirstPhaseCounter("cpu/gpu", "v2", false, + } + case MlirBridgeRolloutPolicy::kDisabledAfterGraphAnalysis: + // Graph analysis only runs on TPU graph. + VLOG(1) << "Skipping MLIR TPU Bridge, disabled because the " + "graph has unsupported features. The fallback will evaluate."; + metrics::UpdateTfMlirBridgeFirstPhaseCounter("tpu", "v2", true, "invalid_graph"); + // We set `uses_uninitialized_resource_args` to false here because the + // first phase of the bridge is not affected by uninitialized resource + // args. + // For Invalid Graph Analysis we need to log here because Run will not + // be called. 
+ LogGraphFeatures(graph, &function_library, config_proto, + /*uses_uninitialized_resource_args=*/false, + /*is_v1_compat=*/false); return MlirOptimizationPassState::Disabled; } } @@ -274,14 +200,18 @@ MlirOptimizationPassState MlirBridgePass::GetPassState( const DeviceSet* device_set, const ConfigProto& config_proto, const Graph& graph, const FunctionLibraryDefinition& function_library) const { + // While we do not use device type information to choose which pass pipeline + // to execute, it's needed for successful execution. if (!device_set) { // This is not expected in practice. VLOG(1) << "Device set is empty!"; return MlirOptimizationPassState::Disabled; } - return GetPassStateImpl(/*run_replicated_bridge*/ HasTPUDevice(*device_set), - config_proto, graph, function_library); + return GetPassStateImpl( + /*is_supported_by_replicated_brige*/ IsSupportedByReplicatedBridge( + graph, &function_library), + config_proto, graph, function_library); } // This runs the first phase of the "bridge", transforming the graph in a form @@ -297,18 +227,9 @@ Status MlirBridgePass::Run(const std::string& function_name, static absl::once_flag flag; absl::call_once(flag, UpdateLogVerbosityIfDefined, "TF_DEBUG_LOG_VERBOSITY"); - // Check if it's possible for a replicated graph to not have a device type. - if (IsReplicatedGraphWithoutDeviceType(module)) { - replicated_graphs_without_device_type_counter->GetCell("v2")->IncrementBy( - 1); - } - - // Check if the graph has any XLA-compilable ops. - // This check needs to precede GetPassState for instrumentation purposes. 
- bool run_replicated_bridge = RunReplicatedBridge(module); - if (!run_replicated_bridge && !RunNonReplicatedBridge(graph)) { - VLOG(1) << "Skipping MLIR TF2XLA Bridge, no XLA-compilable ops found."; - return OkStatus(); + if (!HasDevice(module)) { + LOG(INFO) << "No devices in " << function_name << "\n"; + return absl::OkStatus(); } if (HasTPUPartitionedCallOpInModule(module)) { @@ -320,8 +241,9 @@ Status MlirBridgePass::Run(const std::string& function_name, // TODO(b/241853328): Add caching of pass state and call logging/metrics // related to graph analysis from here. - auto pass_state = GetPassStateImpl(run_replicated_bridge, config_proto, graph, - function_library); + bool is_supported_by_replicated_brige = IsSupportedByReplicatedBridge(module); + auto pass_state = GetPassStateImpl(is_supported_by_replicated_brige, + config_proto, graph, function_library); if (pass_state == MlirOptimizationPassState::Disabled) { // GetPassState is called before run() and run() will only be called if the @@ -333,7 +255,7 @@ Status MlirBridgePass::Run(const std::string& function_name, } bool fallback_enabled = false; - if (run_replicated_bridge) { + if (is_supported_by_replicated_brige) { if (pass_state == MlirOptimizationPassState::FallbackEnabled) { // We set `uses_uninitialized_resource_args` to false here because the // first phase of the bridge is not affected by uninitialized resource @@ -350,7 +272,7 @@ Status MlirBridgePass::Run(const std::string& function_name, TF_RETURN_IF_ERROR( tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tf2xla::v2::DeviceType::XLA_TPU_JIT, fallback_enabled, + module, /*is_supported_by_replicated_brige*/ true, fallback_enabled, function_name)); TF_RETURN_IF_ERROR( @@ -360,8 +282,8 @@ Status MlirBridgePass::Run(const std::string& function_name, VLOG(1) << "Running GPU/CPU Bridge"; TF_RETURN_IF_ERROR( tensorflow::tf2xla::v2::RunFunctionTf2xlaClusteringBridge( - module, tf2xla::v2::DeviceType::XLA_GPU_JIT, fallback_enabled, - 
function_name)); + module, /*is_supported_by_replicated_brige*/ false, + fallback_enabled, function_name)); TF_RETURN_IF_ERROR( tensorflow::tfrt_compiler::RunLowerClusterToRuntimeOpsPassPipeline( @@ -376,14 +298,14 @@ MlirOptimizationPassState MlirBridgeV1CompatPass::GetPassState( const DeviceSet* device_set, const ConfigProto& config_proto, const Graph& graph, const FunctionLibraryDefinition& function_library) const { - // Skip MLIR TPU Bridge if no TPU devices found. - if (device_set && !HasTPUDevice(*device_set)) + // Skip MLIR Bridge if no potential XLA clusters are found. + if (!IsSupportedByReplicatedBridge(graph, &function_library)) return MlirOptimizationPassState::Disabled; // We set `uses_uninitialized_resource_args` to false here because the first // phase of the bridge is not affected by uninitialized resource args. MlirBridgeRolloutPolicy policy = GetMlirBridgeRolloutPolicy( graph, /*function_library=*/&function_library, config_proto, - /*run_replicated_bridge*/ true, + /*is_supported_by_replicated_brige*/ true, /*uses_uninitialized_resource_args=*/false, /*is_v1_compat=*/true, /*record_stats=*/false); switch (policy) { @@ -423,14 +345,8 @@ Status MlirBridgeV1CompatPass::Run(const GraphOptimizationPassOptions& options, // Skip function graphs as MlirBridgePass will be used instead. if (options.is_function_graph) return OkStatus(); - // Check if it's possible for a replicated graph to not have a device type. - if (IsReplicatedGraphWithoutDeviceType(module)) { - replicated_graphs_without_device_type_counter->GetCell("v1")->IncrementBy( - 1); - } - // Skip MLIR TPU Bridge if no TPU devices or TPU ops found. 
- if (!RunReplicatedBridge(module)) { + if (!IsSupportedByReplicatedBridge(module)) { VLOG(1) << "Skipping MLIR TPU Bridge V1 Compat, no TPU devices or TPU ops " "found"; return OkStatus(); diff --git a/tensorflow/compiler/tf2xla/ops/xla_ops.cc b/tensorflow/compiler/tf2xla/ops/xla_ops.cc index edb2a40f4d332b..27a534296921cd 100644 --- a/tensorflow/compiler/tf2xla/ops/xla_ops.cc +++ b/tensorflow/compiler/tf2xla/ops/xla_ops.cc @@ -42,7 +42,7 @@ Status UnchangedRank(shape_inference::InferenceContext* c) { } else { c->set_output(0, c->input(0)); } - return OkStatus(); + return absl::OkStatus(); } REGISTER_OP("XlaBroadcastHelper") @@ -294,7 +294,7 @@ static Status XlaDotShapeFunction(shape_inference::InferenceContext* c) { } c->set_output(0, c->MakeShape(output_dims)); - return OkStatus(); + return absl::OkStatus(); } REGISTER_OP("XlaDot") @@ -398,7 +398,7 @@ REGISTER_OP("XlaDynamicSlice") return UnchangedRank(c); } c->set_output(0, size_indices_value); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA DynamicSlice operator, documented at @@ -556,7 +556,7 @@ REGISTER_OP("XlaPad") } c->set_output(0, c->MakeShape(output_dims)); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA Pad operator, documented at @@ -587,7 +587,7 @@ REGISTER_OP("XlaRecv") shape_inference::ShapeHandle s; TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); c->set_output(0, s); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Receives the named tensor from another XLA computation. Wraps the XLA Recv @@ -630,7 +630,7 @@ REGISTER_OP("XlaReduce") } else { c->set_output(0, c->input(0)); } - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA Reduce operator, documented at @@ -684,7 +684,7 @@ REGISTER_OP("XlaVariadicReduce") c->set_output(i, c->input(i)); } } - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the variadic XLA Reduce operator. 
@@ -768,7 +768,7 @@ REGISTER_OP("XlaVariadicReduceV2") for (int i = 0; i < nr_inputs; ++i) { c->set_output(i, output_shape); } - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the variadic XLA Reduce operator. @@ -828,7 +828,7 @@ REGISTER_OP("XlaRngBitGenerator") shape_inference::ShapeHandle output; TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(2, &output)); c->set_output(1, output); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Stateless PRNG bit generator. @@ -912,7 +912,7 @@ REGISTER_OP("XlaKeyValueSort") .SetShapeFn([](shape_inference::InferenceContext* c) { c->set_output(0, c->input(0)); c->set_output(1, c->input(1)); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA Sort operator, documented at @@ -938,7 +938,7 @@ REGISTER_OP("XlaVariadicSort") std::vector input_shapes; TF_RETURN_IF_ERROR(c->input("inputs", &input_shapes)); TF_RETURN_IF_ERROR(c->set_output("outputs", input_shapes)); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA Sort operator, documented at @@ -1066,7 +1066,7 @@ REGISTER_OP("XlaSpmdFullToShardShape") dims.push_back(c->MakeDim(dim)); } c->set_output(0, c->MakeShape(dims)); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( An op used by XLA SPMD partitioner to switch from automatic partitioning to @@ -1092,7 +1092,7 @@ REGISTER_OP("XlaSpmdShardToFullShape") shape_inference::ShapeHandle s; TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); c->set_output(0, s); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( An op used by XLA SPMD partitioner to switch from manual partitioning to @@ -1119,7 +1119,7 @@ REGISTER_OP("XlaReplicaId") .Output("id: int32") .SetShapeFn([](shape_inference::InferenceContext* context) { context->set_output(0, context->MakeShape({})); - return OkStatus(); + return absl::OkStatus(); }) .Doc("Replica ID."); @@ -1212,7 +1212,7 @@ Status 
OptimizationBarrierShape(shape_inference::InferenceContext* c) { for (int i = 0; i < c->num_inputs(); ++i) { c->set_output(i, c->input(i)); } - return OkStatus(); + return absl::OkStatus(); } REGISTER_OP("XlaOptimizationBarrier") @@ -1258,7 +1258,7 @@ REGISTER_OP("XlaCustomCall") shape_inference::ShapeHandle s; TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shape_attr, &s)); c->set_output(0, s); - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Wraps the XLA CustomCall operator @@ -1293,7 +1293,7 @@ REGISTER_OP("XlaCustomCallV2") TF_RETURN_IF_ERROR(c->MakeShapeFromTensorShape(shapes[i], &shape)); c->set_output(i, shape); } - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Emits an HLO `CustomCall` operation with multiple outputs. @@ -1346,7 +1346,7 @@ REGISTER_OP("XlaCallModule") << "] : " << c->DebugString(s); c->set_output(i, s); } - return OkStatus(); + return absl::OkStatus(); }) .Doc(R"doc( Invokes a StableHLO module. @@ -1365,7 +1365,7 @@ version: Tracks changes the semantics of the op, to support backwards the op carries a StableHLO module with compatibility guarantees. From version 5, XLACallModule can include `stablehlo.custom_call` op to execute tf functions. From version 6 the op supports the `disabled_checks` attribute. - See more versioning details at https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+VERSION_MAXIMUM_SUPPORTED%22&type=code. + See more versioning details at https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+kVersionMaximumSupported%22&type=code. module: A serialized computation, a text or bytecode representation of an mlir.Module. The return type must be a tuple if and only if the `Sout` is a list with 0 or more than 1 elements. 
The length of `Tout` and diff --git a/tensorflow/compiler/tf2xla/python/xla.py b/tensorflow/compiler/tf2xla/python/xla.py index 27940b7fb92c17..5846013128611c 100644 --- a/tensorflow/compiler/tf2xla/python/xla.py +++ b/tensorflow/compiler/tf2xla/python/xla.py @@ -667,7 +667,7 @@ def call_module_maximum_supported_version(): """Maximum version of XlaCallModule op supported. See versioning details documentation for the XlaCallModule op at: - https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+VERSION_MAXIMUM_SUPPORTED%22&type=code + https://github.com/search?q=repo%3Atensorflow%2Ftensorflow+path%3Axla_call_module+%22int+kVersionMaximumSupported%22&type=code """ return 9 diff --git a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc index b29d73adffb0a4..a081fa18891ba2 100644 --- a/tensorflow/compiler/tf2xla/rearrange_function_argument.cc +++ b/tensorflow/compiler/tf2xla/rearrange_function_argument.cc @@ -66,7 +66,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, if (first_resource_index == -1) { // No resource input. No need to rewrite. 
*need_rewrite = false; - return OkStatus(); + return absl::OkStatus(); } *need_rewrite = false; @@ -77,7 +77,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, } } if (!*need_rewrite) { - return OkStatus(); + return absl::OkStatus(); } *resource_input_count = 0; @@ -100,7 +100,7 @@ Status InputTypesNeedsRearrange(const std::vector& in_types, } } - return OkStatus(); + return absl::OkStatus(); } // Given mapping between original input index and rearranged input index, @@ -122,7 +122,7 @@ Status ReorderInputEdges(Graph* g, Node* n, g->RemoveEdge(e); g->AddEdge(src, src_output, n, new_dst_input)->DebugString(); } - return OkStatus(); + return absl::OkStatus(); } // For While node, given mapping between original input index and rearranged @@ -154,7 +154,7 @@ Status ReorderOutputEdges(Graph* g, Node* n, int input_count, g->AddEdge(input_edge->src(), input_edge->src_output(), dst, dst_input); } } - return OkStatus(); + return absl::OkStatus(); } // Given mapping between original input index and rearranged input index, change @@ -203,7 +203,7 @@ Status CalculateRetvalRearrange( TF_RETURN_IF_ERROR(GetNodeAttr(arg->def(), "index", &src_index)); resource_retval_to_arg->insert(std::make_pair(i, src_index)); } - return OkStatus(); + return absl::OkStatus(); } // Given original output types and return value index mapping, return the new @@ -252,7 +252,7 @@ Status RearrangeOutputEdges(Node* n, Graph* g, g->AddEdge(n, iter->second, dst, dst_input); } } - return OkStatus(); + return absl::OkStatus(); } // Given mapping between original output index and rearranged output index, @@ -287,7 +287,7 @@ Status MaybeRewriteWhileNode( types, &input_need_rearrange, &resource_input_count, &index_mapping)); if (!input_need_rearrange) { *node_rewritten = false; - return OkStatus(); + return absl::OkStatus(); } *node_rewritten = true; @@ -379,7 +379,7 @@ Status MaybeRewriteWhileNode( n->ClearAttr(attr_name); n->AddAttr(attr_name, attr_value); } - return OkStatus(); + return 
absl::OkStatus(); } Status MaybeRewriteIfNode( @@ -403,7 +403,7 @@ Status MaybeRewriteIfNode( DT_RESOURCE) != out_types.end(); if (!input_need_rearrange && !has_resource_output) { *node_rewritten = false; - return OkStatus(); + return absl::OkStatus(); } *node_rewritten = true; @@ -514,7 +514,7 @@ Status MaybeRewriteIfNode( n->ClearAttr("Tout"); n->AddAttr("Tout", new_out_types); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -557,7 +557,7 @@ Status RearrangeFunctionArguments( } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/resource_util.cc b/tensorflow/compiler/tf2xla/resource_util.cc index e91ce07e6d6983..4180b8f1330bcd 100644 --- a/tensorflow/compiler/tf2xla/resource_util.cc +++ b/tensorflow/compiler/tf2xla/resource_util.cc @@ -104,7 +104,7 @@ Status PropagateFromArgOp( int index; TF_RETURN_IF_ERROR(GetNodeAttr(n.attrs(), "index", &index)); - if (!resource_arg_indices.contains(index)) return OkStatus(); + if (!resource_arg_indices.contains(index)) return absl::OkStatus(); TF_RET_CHECK(function_name.has_value()) << "ResourceUsageAnalysis does not support analyzing _Arg nodes " @@ -122,7 +122,7 @@ Status PropagateFromArgOp( (*user_to_source)[o] = src_node_info; } - return OkStatus(); + return absl::OkStatus(); } Status UpdateResourceUsageFromFunctionBodyAnalysis( @@ -176,7 +176,7 @@ Status UpdateResourceUsageFromFunctionBodyAnalysis( } } - return OkStatus(); + return absl::OkStatus(); } Status PropagateThroughCallOp( @@ -219,7 +219,7 @@ Status PropagateThroughCallOp( TF_RETURN_IF_ERROR(UpdateResourceUsageFromFunctionBodyAnalysis( n, function_name, *fbody, called_function_source_to_path, user_to_source, source_to_path)); - return OkStatus(); + return absl::OkStatus(); } // Analyzes pass through values for Identity and IdentityN ops. 
@@ -246,7 +246,7 @@ Status PropagateThroughIdentityOp( } } - return OkStatus(); + return absl::OkStatus(); } Status AnalyzeResourceUsage( @@ -313,7 +313,7 @@ Status AnalyzeResourceUsage( it.first->dst()->type_string()); } - return OkStatus(); + return absl::OkStatus(); } } // anonymous namespace diff --git a/tensorflow/compiler/tf2xla/shape_util.cc b/tensorflow/compiler/tf2xla/shape_util.cc index b9093c2105cd0a..e01d1b919f9699 100644 --- a/tensorflow/compiler/tf2xla/shape_util.cc +++ b/tensorflow/compiler/tf2xla/shape_util.cc @@ -45,7 +45,7 @@ Status PopulateInfeedLayoutVector(const xla::Shape& shape, } else { layouts->insert(layouts->end(), shape.rank(), -1); } - return OkStatus(); + return absl::OkStatus(); } // Populate the output layout unless the minor_to_major array contains all -1 @@ -83,7 +83,7 @@ Status AssignLayout( layout = layout_func(*shape); } *shape->mutable_layout() = layout; - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -100,7 +100,7 @@ Status XLAShapeToTensorShape(const xla::Shape& shape, for (int i = 0; i < shape.rank(); ++i) { TF_RETURN_IF_ERROR(tensor_shape->AddDimWithStatus(shape.dimensions(i))); } - return OkStatus(); + return absl::OkStatus(); } // Convert a TensorShape into the equivalent XLA Shape proto. @@ -110,7 +110,7 @@ Status TensorShapeToXLAShape(DataType dtype, xla::PrimitiveType type; TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(dtype, &type)); *shape = TensorShapeToXLAShape(type, tensor_shape); - return OkStatus(); + return absl::OkStatus(); } Status TensorShapeToBoundedXLAShape(DataType dtype, @@ -122,7 +122,7 @@ Status TensorShapeToBoundedXLAShape(DataType dtype, if (tensor_shape.unknown_rank()) { // For unknown shape, create a rank 1 size 0 tensor. 
*shape = xla::ShapeUtil::MakeShapeWithDenseLayout(type, {0}, {0}); - return OkStatus(); + return absl::OkStatus(); } if (tensor_shape.dims() != bound.dims()) { @@ -157,7 +157,7 @@ Status TensorShapeToBoundedXLAShape(DataType dtype, } } *shape = result; - return OkStatus(); + return absl::OkStatus(); } xla::Shape TensorShapeToXLAShape(xla::PrimitiveType type, @@ -190,7 +190,7 @@ Status TensorShapeToXLAShape(DataType dtype, const TensorShape& tensor_shape, xla::PrimitiveType type; TF_RETURN_IF_ERROR(DataTypeToPrimitiveType(dtype, &type)); *shape = TensorShapeToXLAShape(type, tensor_shape); - return OkStatus(); + return absl::OkStatus(); } StatusOr TensorShapeToXLAShape(DataType dtype, @@ -272,7 +272,7 @@ Status GetShapeWithLayout( VLOG(4) << "Shape[] = " << xla::ShapeUtil::HumanStringWithLayout(*output_shape); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/side_effect_util.cc b/tensorflow/compiler/tf2xla/side_effect_util.cc index 1a62857c537cc1..9446e4b4adadb9 100644 --- a/tensorflow/compiler/tf2xla/side_effect_util.cc +++ b/tensorflow/compiler/tf2xla/side_effect_util.cc @@ -74,7 +74,7 @@ Status SetDeviceOrdinalAttributeForNode(Node* node, int device_ordinal) { return errors::Internal("Unknown node type to set 'device_ordinal': ", node->DebugString()); } - return OkStatus(); + return absl::OkStatus(); } std::set CalculateTokenInputsForOutputToken(const Graph& g) { @@ -143,7 +143,7 @@ Status ParseHostComputeCoreList(absl::Span list_from_attr, } (*host_compute_core)[parts[0]] = core; } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/test_util.cc b/tensorflow/compiler/tf2xla/test_util.cc index 0fbf983058ca32..3fb8523ce71e0c 100644 --- a/tensorflow/compiler/tf2xla/test_util.cc +++ b/tensorflow/compiler/tf2xla/test_util.cc @@ -37,7 +37,7 @@ Status InstantiateFunctionForTest(const string& name, for (NodeDef& n : inst.nodes) { 
*result->gdef.add_node() = std::move(n); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla.cc b/tensorflow/compiler/tf2xla/tf2xla.cc index 61768b0ff5557e..ef87b320cdcd0c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla.cc +++ b/tensorflow/compiler/tf2xla/tf2xla.cc @@ -125,7 +125,7 @@ Status ConvertGraphToXla(std::unique_ptr graph, ++input_index; } } - return OkStatus(); + return absl::OkStatus(); } Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { @@ -141,7 +141,7 @@ Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { node.mutable_attr()->erase("allowed_devices"); } } - return OkStatus(); + return absl::OkStatus(); }; for (auto& node : *graph_def->mutable_node()) { TF_RETURN_IF_ERROR(update_var_handle_op_node(node)); @@ -151,7 +151,7 @@ Status ConvertVarHandlesToAotVarHandles(GraphDef* graph_def) { TF_RETURN_IF_ERROR(update_var_handle_op_node(node)); } } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -164,7 +164,7 @@ Status ConvertGraphDefToXla(GraphDef graph_def, const tf2xla::Config& config, TF_RETURN_IF_ERROR(InitGraph(graph_def, config, &graph)); TF_RETURN_IF_ERROR( ConvertGraphToXla(std::move(graph), config, client, computation)); - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util.cc b/tensorflow/compiler/tf2xla/tf2xla_util.cc index a43ad91c20d828..ad5de83c814d4c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util.cc @@ -55,7 +55,7 @@ Status ValidateTensorId(const tf2xla::TensorId& id) { if (id.output_index() < 0) { return errors::InvalidArgument("TensorId output_index must be positive"); } - return OkStatus(); + return absl::OkStatus(); } Status CheckNameDuplicates(const string& kind, const string& name, @@ -65,7 +65,7 @@ Status CheckNameDuplicates(const string& kind, const string& name, return 
errors::InvalidArgument("duplicate ", kind, " name: ", name); } } - return OkStatus(); + return absl::OkStatus(); } Status CheckFeedFetchNameConflicts(const string& kind, @@ -79,7 +79,7 @@ Status CheckFeedFetchNameConflicts(const string& kind, " and ", name_data); } } - return OkStatus(); + return absl::OkStatus(); } // For graph `g`, copy all function call nodes' FunctionDef from `lookup_fld` to @@ -108,7 +108,7 @@ Status CopyAssociatedFunctions(Graph* g, } } } - return OkStatus(); + return absl::OkStatus(); } // Replaces the single edge feeding into {dst,dst_input} with a new @@ -162,7 +162,7 @@ Status ReplaceSrcOutputUsageWithNode(Graph* g, Node* src, int src_output, } } } - return OkStatus(); + return absl::OkStatus(); } // For graph `g`, replaces _Arg nodes whose "index" attribute is in @@ -190,7 +190,7 @@ Status ReplaceArgUsageWithConstNode( TF_RETURN_IF_ERROR( ReplaceSrcOutputUsageWithNode(g, arg_node, 0, const_node)); } - return OkStatus(); + return absl::OkStatus(); } // Replaces the single input to _Retval nodes with an index in the keys of @@ -220,7 +220,7 @@ Status ReplaceRetvalInputWithArg( ReplaceEdge(g, ret_nodes[arg_index], 0, arg_nodes[arg_index], 0) .status()); } - return OkStatus(); + return absl::OkStatus(); } // For a node's function attr (e.g. then/else branch for "If" nodes), rewrites @@ -276,7 +276,7 @@ Status PropagateConstIntoFuncAttr( // Copy associated functions. 
TF_RETURN_IF_ERROR(CopyAssociatedFunctions(func_graph, lookup_fld, fld)); - return OkStatus(); + return absl::OkStatus(); } // For an "If" node in graph `g`, if it has Const node inputs, rewrite its @@ -295,7 +295,7 @@ Status PropagateConstIntoIfNode(Graph* g, Node* if_node, } } if (const_input_index_to_node.empty()) { - return OkStatus(); + return absl::OkStatus(); } // Rewrite "then_branch" and "else_branch" function, replace usage of those @@ -306,7 +306,7 @@ Status PropagateConstIntoIfNode(Graph* g, Node* if_node, if_node, attr_name, const_input_index_to_node, lookup_fld, fld)); } - return OkStatus(); + return absl::OkStatus(); } using GraphCache = absl::flat_hash_map>; @@ -456,7 +456,7 @@ Status PropagateConstIntoAndAroundWhileNode( const_input_index_to_node[i] = input_edge->src(); } if (const_input_index_to_node.empty()) { - return OkStatus(); + return absl::OkStatus(); } // Rewrite "cond" and "body" function, replace usage of those _Arg nodes with @@ -473,7 +473,7 @@ Status PropagateConstIntoAndAroundWhileNode( TF_RETURN_IF_ERROR( ReplaceSrcOutputUsageWithNode(g, while_node, it.first, it.second)); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -502,7 +502,7 @@ Status ValidateConfig(const tf2xla::Config& config) { if (config.fetch().empty()) { return errors::InvalidArgument("fetches must be specified"); } - return OkStatus(); + return absl::OkStatus(); } Status AddPlaceholdersForFeeds( @@ -599,7 +599,7 @@ Status AddPlaceholdersForFeeds( } } - return OkStatus(); + return absl::OkStatus(); } Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, @@ -664,7 +664,7 @@ Status PruneGraphDefInto(const tf2xla::Config& config, const GraphDef& in, *out->add_node() = node; } } - return OkStatus(); + return absl::OkStatus(); } string TensorIdToString(const tf2xla::TensorId& id) { @@ -695,7 +695,7 @@ Status SetNodeShardingFromNeighbors(Node* n, bool out_edges) { n->set_assigned_device_name(matching_node->assigned_device_name()); 
n->set_requested_device(matching_node->requested_device()); } - return OkStatus(); + return absl::OkStatus(); } void AddDtypeToKernelDefConstraint(absl::string_view name, DataType dtype, @@ -858,7 +858,7 @@ Status RewriteAssociatedFunction( } } - return OkStatus(); + return absl::OkStatus(); } Status CachedFunctionHandles::GetOrInstantiate( @@ -868,12 +868,12 @@ Status CachedFunctionHandles::GetOrInstantiate( auto iter = handles_.find(canonicalized_name); if (iter != handles_.end()) { *handle = iter->second; - return OkStatus(); + return absl::OkStatus(); } TF_RETURN_IF_ERROR(flr_->Instantiate(func_name, attrs, handle)); handles_[canonicalized_name] = *handle; - return OkStatus(); + return absl::OkStatus(); } Status CachedFunctionHandles::ReleaseAllHandles() { @@ -965,7 +965,7 @@ Status PropagateConstIntoFunctionalNodes( } } } - return OkStatus(); + return absl::OkStatus(); } Status PruneUnreachableFunctionsFromGraph(const Graph& g, @@ -979,7 +979,7 @@ Status PruneUnreachableFunctionsFromGraph(const Graph& g, TF_RETURN_IF_ERROR(fld->RemoveFunction(func_name)); } } - return OkStatus(); + return absl::OkStatus(); } Status RewriteTensorListWithConstElement(Graph* g, @@ -1116,7 +1116,7 @@ Status RewriteTensorListWithConstElement(Graph* g, bwd_while->ClearAttr("body"); bwd_while->AddAttr("body", bwd_body_attr); } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc index d04dbd314a4931..d1ea22324c7e8c 100644 --- a/tensorflow/compiler/tf2xla/tf2xla_util_test.cc +++ b/tensorflow/compiler/tf2xla/tf2xla_util_test.cc @@ -41,7 +41,7 @@ namespace tensorflow { namespace { void ExpectErrorContains(const Status& status, absl::string_view str) { - EXPECT_NE(OkStatus(), status); + EXPECT_NE(absl::OkStatus(), status); EXPECT_TRUE(absl::StrContains(status.message(), str)) << "expected error: " << status.message() << " to contain: " << str; } diff --git 
a/tensorflow/compiler/tf2xla/type_util.cc b/tensorflow/compiler/tf2xla/type_util.cc index 335cfdf37b1605..8221aa28b27b0b 100644 --- a/tensorflow/compiler/tf2xla/type_util.cc +++ b/tensorflow/compiler/tf2xla/type_util.cc @@ -28,66 +28,66 @@ Status DataTypeToPrimitiveType(DataType data_type, xla::PrimitiveType* type) { switch (data_type) { case tensorflow::DT_BOOL: *type = xla::PRED; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_INT4: *type = xla::S4; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_INT8: case tensorflow::DT_QINT8: *type = xla::S8; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_INT16: case tensorflow::DT_QINT16: *type = xla::S16; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_INT32: case tensorflow::DT_QINT32: *type = xla::S32; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_INT64: *type = xla::S64; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_UINT4: *type = xla::U4; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_UINT8: case tensorflow::DT_QUINT8: *type = xla::U8; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_UINT16: case tensorflow::DT_QUINT16: *type = xla::U16; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_UINT32: *type = xla::U32; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_UINT64: *type = xla::U64; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_FLOAT8_E5M2: *type = xla::F8E5M2; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_FLOAT8_E4M3FN: *type = xla::F8E4M3FN; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_BFLOAT16: *type = xla::BF16; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_HALF: *type = xla::F16; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_FLOAT: *type = xla::F32; - return OkStatus(); + return absl::OkStatus(); case 
tensorflow::DT_DOUBLE: *type = xla::F64; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_COMPLEX64: *type = xla::C64; - return OkStatus(); + return absl::OkStatus(); case tensorflow::DT_COMPLEX128: *type = xla::C128; - return OkStatus(); + return absl::OkStatus(); default: return errors::InvalidArgument( "Unsupported type in DataTypeToPrimitiveType: '", diff --git a/tensorflow/compiler/tf2xla/xla_compilation_device.cc b/tensorflow/compiler/tf2xla/xla_compilation_device.cc index 240e4bb1a78ceb..bbdf5c7d2c74fa 100644 --- a/tensorflow/compiler/tf2xla/xla_compilation_device.cc +++ b/tensorflow/compiler/tf2xla/xla_compilation_device.cc @@ -140,7 +140,7 @@ void XlaCompilationDevice::Compute(OpKernel* op_kernel, VLOG(4) << "Done"; } -Status XlaCompilationDevice::Sync() { return OkStatus(); } +Status XlaCompilationDevice::Sync() { return absl::OkStatus(); } Status XlaCompilationDevice::MakeTensorFromProto( const TensorProto& tensor_proto, const AllocatorAttributes alloc_attrs, diff --git a/tensorflow/compiler/tf2xla/xla_compiler.cc b/tensorflow/compiler/tf2xla/xla_compiler.cc index bb8b29de5b9acf..1ab4f2fd3a9a81 100644 --- a/tensorflow/compiler/tf2xla/xla_compiler.cc +++ b/tensorflow/compiler/tf2xla/xla_compiler.cc @@ -75,6 +75,7 @@ limitations under the License. 
#include "tensorflow/core/platform/logging.h" #include "tensorflow/core/protobuf/error_codes.pb.h" #include "tensorflow/core/tpu/tpu_defs.h" +#include "tensorflow/core/util/debug_data_dumper.h" #include "tensorflow/core/util/dump_graph.h" #include "tsl/platform/tensor_float_32_utils.h" @@ -99,7 +100,7 @@ Status CheckSignature(const DataTypeVector& types, " but function parameter has type ", DataTypeString(types[i])); } } - return OkStatus(); + return absl::OkStatus(); } // Uses the _Arg and _Retval nodes in the graph to determine an OpSharding for @@ -444,7 +445,7 @@ Status BuildComputation( TF_ASSIGN_OR_RETURN(auto program_shape, computation->GetProgramShape()); *output_shape = program_shape.result(); - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -518,7 +519,7 @@ string XlaCompiler::Argument::ShapeHumanString() const { XlaCompiler::XlaCompiler(XlaCompiler::Options options) : options_(options), - initialization_status_(OkStatus()), + initialization_status_(absl::OkStatus()), next_step_id_(1), device_(new XlaCompilationDevice(SessionOptions(), options_.device_type)), device_mgr_(absl::WrapUnique(device_)) { @@ -571,7 +572,7 @@ static Status GetFunctionBody(const NameAttrList& function, *fbody = flib_runtime->GetFunctionBody(handle); TF_RET_CHECK(*fbody); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::FindFunctionBody(const NameAttrList& function, @@ -598,7 +599,7 @@ Status XlaCompiler::FindFunctionBody(const NameAttrList& function, } VLOG(4) << "Function " << function.name() << " in local_flib_runtime_"; } - return OkStatus(); + return absl::OkStatus(); } std::unique_ptr XlaCompiler::GetGraph(const FunctionBody* fbody) { @@ -815,7 +816,7 @@ Status XlaCompiler::CompileFunction( auto it = cache_.find({function_id, arg_vector}); if (it != cache_.end()) { *result = it->second; - return OkStatus(); + return absl::OkStatus(); } const FunctionBody* fbody; @@ -927,7 +928,7 @@ Status XlaCompiler::CompileFunction( VLOG(1) << 
"===================================================="; cache_[{function_id, arg_vector}] = *result; - return OkStatus(); + return absl::OkStatus(); } // Computes the XLA shape for argument 'arg'. @@ -975,12 +976,12 @@ Status XlaCompiler::XLAShapeForArgument( arg.type, std::get(arg.shape), xla_shape)); } } - return OkStatus(); + return absl::OkStatus(); } case XlaCompiler::Argument::kTensorList: { TF_RET_CHECK(absl::holds_alternative(arg.shape)); *xla_shape = std::get(arg.shape); - return OkStatus(); + return absl::OkStatus(); } case XlaCompiler::Argument::kConstantResource: case XlaCompiler::Argument::kResource: { @@ -1000,7 +1001,7 @@ Status XlaCompiler::XLAShapeForArgument( TF_RETURN_IF_ERROR(RewriteLayoutWithShardedShape( arg_sharding, arg.fast_mem, options_.shape_determination_fns, xla_shape)); - return OkStatus(); + return absl::OkStatus(); } case XlaResource::kTensorArray: { if (arg.max_array_size < 0) { @@ -1018,7 +1019,7 @@ Status XlaCompiler::XLAShapeForArgument( arg.tensor_array_gradients.size() + 1, *xla_shape); *xla_shape = xla::ShapeUtil::MakeTupleShape(tuple_shape); } - return OkStatus(); + return absl::OkStatus(); } case XlaResource::kStack: { if (arg.max_array_size < 0) { @@ -1034,7 +1035,7 @@ Status XlaCompiler::XLAShapeForArgument( TensorShapeToXLAShape(arg.type, shape, &buffer_shape)); *xla_shape = xla::ShapeUtil::MakeTupleShape( {buffer_shape, xla::ShapeUtil::MakeShape(xla::S32, {})}); - return OkStatus(); + return absl::OkStatus(); } case XlaResource::kInvalid: @@ -1044,7 +1045,7 @@ Status XlaCompiler::XLAShapeForArgument( } case XlaCompiler::Argument::kToken: { *xla_shape = xla::ShapeUtil::MakeTokenShape(); - return OkStatus(); + return absl::OkStatus(); } case XlaCompiler::Argument::kInvalid: return errors::Internal("Invalid argument type in XLAShapeForArgument()"); @@ -1143,7 +1144,7 @@ Status XlaCompiler::BuildArguments( } if (input_to_args->empty() && !use_tuple_arg) { - return OkStatus(); + return absl::OkStatus(); } // `arg_to_inputs[c] 
= d` means that the c'th original arg index corresponds @@ -1304,7 +1305,7 @@ Status XlaCompiler::BuildArguments( } } - return OkStatus(); + return absl::OkStatus(); } namespace { @@ -1320,7 +1321,7 @@ Status ValidateFunctionDef(const FunctionDef* fdef, const OpDef* op_def; TF_RETURN_IF_ERROR(OpRegistry::Global()->LookUpOpDef(op, &op_def)); } - return OkStatus(); + return absl::OkStatus(); } // If node is PartitionedCall or StatefulPartitionedCall, returns the @@ -1339,10 +1340,10 @@ Status GetPotentialFunctionName(const Node& node, const string** name) { " does not have 'func' field set"); } *name = &attr_value->func().name(); - return OkStatus(); + return absl::OkStatus(); } *name = &node.type_string(); - return OkStatus(); + return absl::OkStatus(); } // Check that the graph doesn't have any invalid nodes (e.g. incompatible with @@ -1378,7 +1379,7 @@ Status ValidateGraph(const Graph* graph, return errors::InvalidArgument(errmsg); } - return OkStatus(); + return absl::OkStatus(); }; for (const Node* node : graph->nodes()) { @@ -1401,7 +1402,7 @@ Status ValidateGraph(const Graph* graph, s = FindKernelDef(device_type, node->def(), nullptr, nullptr); TF_RETURN_IF_ERROR(maybe_error(node, s)); } - return OkStatus(); + return absl::OkStatus(); } void ConvertConstantsToExpressions(xla::XlaBuilder* builder, @@ -1478,7 +1479,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, absl::Span args, CompilationResult* result) { VLOG(1) << "Executing graph symbolically to populate XlaBuilder.: " << name; - if (VLOG_IS_ON(2)) { + if (VLOG_IS_ON(2) || DEBUG_DATA_DUMPER()->ShouldDump(name, kDebugGroupMain)) { VLOG(2) << "XlaCompiler::CompileGraph: " << DumpGraphToFile(absl::StrCat("xla_compile_graph_", name), *graph, flib_runtime_->GetFunctionLibraryDefinition()); @@ -1602,10 +1603,28 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, result->input_mapping)); for (const auto& [key, send] : host_compute_sends_) { - 
*result->host_compute_metadata.add_device_to_host() = send; + auto* d2h = result->host_compute_metadata.add_device_to_host(); + *d2h = send; + + for (int i = 0; i < d2h->metadata_size(); ++i) { + const std::string channel_name = + GetDeviceToHostChannelName(d2h->key(), i); + xla::ChannelHandle handle; + TF_RETURN_IF_ERROR(GetDeviceToHostChannelHandle(channel_name, &handle)); + d2h->mutable_metadata(i)->set_channel_id(handle.handle()); + } } for (const auto& [key, recv] : host_compute_recvs_) { - *result->host_compute_metadata.add_host_to_device() = recv; + auto* h2d = result->host_compute_metadata.add_host_to_device(); + *h2d = recv; + + for (int i = 0; i < h2d->metadata_size(); ++i) { + const std::string channel_name = + GetHostToDeviceChannelName(h2d->key(), i); + xla::ChannelHandle handle; + TF_RETURN_IF_ERROR(GetHostToDeviceChannelHandle(channel_name, &handle)); + h2d->mutable_metadata(i)->set_channel_id(handle.handle()); + } } if (!tsl::tensor_float_32_execution_enabled()) { @@ -1617,7 +1636,7 @@ Status XlaCompiler::CompileGraph(const XlaCompiler::CompileOptions& options, VLOG(2) << "XLA output shape: " << xla::ShapeUtil::HumanStringWithLayout(result->xla_output_shape); result->collective_info = context->GetCollectiveInfo(); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::GetChannelHandle(const string& key, @@ -1628,7 +1647,7 @@ Status XlaCompiler::GetChannelHandle(const string& key, } *channel = result.first->second; VLOG(1) << "Channel: " << key << " " << channel->DebugString(); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::GetHostToDeviceChannelHandle(const string& key, @@ -1640,7 +1659,7 @@ Status XlaCompiler::GetHostToDeviceChannelHandle(const string& key, } *channel = result.first->second; VLOG(1) << "Host to device channel: " << key << " " << channel->DebugString(); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::GetDeviceToHostChannelHandle(const string& key, @@ -1652,7 +1671,7 @@ 
Status XlaCompiler::GetDeviceToHostChannelHandle(const string& key, } *channel = result.first->second; VLOG(1) << "Device to host channel: " << key << " " << channel->DebugString(); - return OkStatus(); + return absl::OkStatus(); } namespace { @@ -1679,7 +1698,7 @@ Status XlaCompiler::SetDeviceToHostMetadata( tf2xla::HostTransferMetadata new_transfer; SetTransfer(key, types, shapes, &new_transfer); if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { - return OkStatus(); + return absl::OkStatus(); } else { return errors::InvalidArgument( "Duplicate calls to SetDeviceToHostMetadata with key ", key); @@ -1687,7 +1706,7 @@ Status XlaCompiler::SetDeviceToHostMetadata( } tf2xla::HostTransferMetadata& transfer = host_compute_sends_[key]; SetTransfer(key, types, shapes, &transfer); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::GetDeviceToHostShapes( @@ -1702,7 +1721,7 @@ Status XlaCompiler::GetDeviceToHostShapes( TensorShape shape(iter->second.metadata(i).shape()); shapes->push_back(shape); } - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::SetHostToDeviceMetadata( @@ -1713,7 +1732,7 @@ Status XlaCompiler::SetHostToDeviceMetadata( tf2xla::HostTransferMetadata new_transfer; SetTransfer(key, types, shapes, &new_transfer); if (xla::protobuf_util::ProtobufEquals(existing_transfer, new_transfer)) { - return OkStatus(); + return absl::OkStatus(); } else { return errors::InvalidArgument( "Duplicate calls to SetHostToDeviceMetadata with key ", key); @@ -1721,7 +1740,7 @@ Status XlaCompiler::SetHostToDeviceMetadata( } tf2xla::HostTransferMetadata& transfer = host_compute_recvs_[key]; SetTransfer(key, types, shapes, &transfer); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::GetHostComputeControlDependency( @@ -1734,7 +1753,7 @@ Status XlaCompiler::GetHostComputeControlDependency( } else { *handle = iter->second; } - return OkStatus(); + return absl::OkStatus(); } Status 
XlaCompiler::SetHostComputeControlDependency( @@ -1746,7 +1765,7 @@ Status XlaCompiler::SetHostComputeControlDependency( host_compute_name); } host_compute_control_output_[host_compute_name] = handle; - return OkStatus(); + return absl::OkStatus(); } void XlaCompiler::PushNodeTokenMapping() { @@ -1760,7 +1779,7 @@ Status XlaCompiler::PopNodeTokenMapping() { "empty."); } node_token_mapping_stack_.pop(); - return OkStatus(); + return absl::OkStatus(); } Status XlaCompiler::SetNodeToken(const string& node_name, const xla::XlaOp op) { @@ -1774,7 +1793,7 @@ Status XlaCompiler::SetNodeToken(const string& node_name, const xla::XlaOp op) { return errors::FailedPrecondition("Token mapping already exists for node ", node_name); } - return OkStatus(); + return absl::OkStatus(); } StatusOr XlaCompiler::GetNodeToken(const string& node_name) { diff --git a/tensorflow/compiler/tf2xla/xla_context.cc b/tensorflow/compiler/tf2xla/xla_context.cc index 04f12c7ca575d4..ff444efe752640 100644 --- a/tensorflow/compiler/tf2xla/xla_context.cc +++ b/tensorflow/compiler/tf2xla/xla_context.cc @@ -195,7 +195,7 @@ Status XlaContext::RecordCollectiveInfoFromNestedCompilationResult( result.collective_info->group_size) .status(); } - return OkStatus(); + return absl::OkStatus(); } StatusOr XlaContext::RecordCollectiveInfo(int group_key, diff --git a/tensorflow/compiler/tf2xla/xla_helpers.cc b/tensorflow/compiler/tf2xla/xla_helpers.cc index dc40bb47e8f8fe..5f99e7f284e26c 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.cc +++ b/tensorflow/compiler/tf2xla/xla_helpers.cc @@ -86,7 +86,7 @@ xla::XlaOp XlaHelpers::FloatLiteral(xla::XlaBuilder* b, DataType data_type, *output = input.Clone(); output->mutable_shape_do_not_use()->Swap(&shape); - return OkStatus(); + return absl::OkStatus(); } Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64_t depth, int axis, @@ -110,7 +110,7 @@ Status XlaHelpers::OneHot(xla::XlaBuilder* builder, int64_t depth, int axis, xla::Eq(indices, xla::Iota(builder, 
iota_shape, axis), broadcast_dims), xla::Broadcast(on_value, output_shape.dim_sizes()), xla::Broadcast(off_value, output_shape.dim_sizes())); - return OkStatus(); + return absl::OkStatus(); } DataType XlaHelpers::SumAccumulationType(const DataType& dtype) { @@ -253,7 +253,7 @@ Status ResolveDeviceAssignment( }); run_options.set_device_assignment(&device_assignment); run_options.set_gpu_executable_run_options(&gpu_options); - return OkStatus(); + return absl::OkStatus(); } } // end namespace tensorflow diff --git a/tensorflow/compiler/tf2xla/xla_helpers.h b/tensorflow/compiler/tf2xla/xla_helpers.h index 0bebf471ecfbe9..ee483af794e26d 100644 --- a/tensorflow/compiler/tf2xla/xla_helpers.h +++ b/tensorflow/compiler/tf2xla/xla_helpers.h @@ -18,6 +18,8 @@ limitations under the License. #ifndef TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ #define TENSORFLOW_COMPILER_TF2XLA_XLA_HELPERS_H_ +#include + #include "absl/types/optional.h" #include "absl/types/span.h" #include "tensorflow/compiler/tf2xla/host_compute_metadata.pb.h" @@ -33,6 +35,15 @@ namespace tensorflow { using XlaLayoutPreference = mlir::XlaLayoutPreference; +inline std::string GetDeviceToHostChannelName(absl::string_view channel_key, + int index) { + return absl::StrCat(channel_key, "_dtoh_", index); +} +inline std::string GetHostToDeviceChannelName(absl::string_view channel_key, + int index) { + return absl::StrCat(channel_key, "_htod_", index); +} + // Helper methods for building XLA computations. class XlaHelpers { public: diff --git a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc index ad65c1708794fd..8e8d3f28d8a47a 100644 --- a/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc +++ b/tensorflow/compiler/tf2xla/xla_jit_compiled_cpu_function_test.cc @@ -25,8 +25,8 @@ limitations under the License. 
#include "xla/service/platform_util.h" #include "xla/shape_util.h" #include "xla/status_macros.h" -#include "xla/stream_executor/multi_platform_manager.h" #include "xla/stream_executor/platform.h" +#include "xla/stream_executor/platform_manager.h" #include "xla/test.h" #include "xla/xla_data.pb.h" #include "tensorflow/core/framework/attr_value.pb.h" @@ -340,8 +340,8 @@ TEST(XlaJitCompiledCpuFunction, CanCompileWithAdditionalPlatform) { string name_; }; - TF_EXPECT_OK(se::MultiPlatformManager::RegisterPlatform( - std::make_unique())); + TF_EXPECT_OK( + se::PlatformManager::RegisterPlatform(std::make_unique())); xla::Compiler::RegisterCompilerFactory(kFakePlatformId, []() { return std::unique_ptr(nullptr); }); diff --git a/tensorflow/compiler/tf2xla/xla_op_kernel.cc b/tensorflow/compiler/tf2xla/xla_op_kernel.cc index 237d115aa298e5..f70a7df612d149 100644 --- a/tensorflow/compiler/tf2xla/xla_op_kernel.cc +++ b/tensorflow/compiler/tf2xla/xla_op_kernel.cc @@ -196,7 +196,7 @@ Status XlaOpKernelContext::ConstantInputReshaped( } TF_ASSIGN_OR_RETURN(*constant_literal, HostTensorToLiteral(temp)); - return OkStatus(); + return absl::OkStatus(); } // Converts an int16, int32 or int64 scalar literal to an int64. @@ -214,7 +214,7 @@ static Status LiteralToInt64Scalar(const xla::LiteralSlice& literal, } else { return errors::InvalidArgument("value must be int16, int32, or int64"); } - return OkStatus(); + return absl::OkStatus(); } // Converts an float32 or float64 scalar literal to a float64. 
@@ -230,7 +230,7 @@ static Status LiteralToFloat64Scalar(const xla::LiteralSlice& literal, } else { return errors::InvalidArgument("value must be either float32 or float64"); } - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ConstantInputAsIntScalar( @@ -273,7 +273,7 @@ static Status LiteralToPredVector(const xla::LiteralSlice& literal, for (int64_t i = 0; i < size; ++i) { out->push_back(literal.Get({i})); } - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ResolveInputDynamismIntoPred(int index, bool* out) { @@ -288,7 +288,7 @@ Status XlaOpKernelContext::ResolveInputDynamismIntoPred(int index, bool* out) { // TODO(b/176993339): Support resolving dynamism across computations so // resolving dynamism will not fail in those cases. *out = true; - return OkStatus(); + return absl::OkStatus(); } Tensor dynamism = dynamism_or_status.value(); @@ -302,7 +302,7 @@ Status XlaOpKernelContext::ResolveInputDynamismIntoPred(int index, bool* out) { TF_ASSIGN_OR_RETURN(literal, HostTensorToLiteral(temp)); *out = literal.Get({}); - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ResolveInputDynamismIntoPredVector( @@ -332,7 +332,7 @@ Status XlaOpKernelContext::ResolveInputDynamismReshaped( .Broadcast(xla::ShapeUtil::MakeShape(xla::PRED, new_dims), {}) .value(); - return OkStatus(); + return absl::OkStatus(); } Tensor dynamism = dynamism_or_status.value(); @@ -346,7 +346,7 @@ Status XlaOpKernelContext::ResolveInputDynamismReshaped( } TF_ASSIGN_OR_RETURN(*dynamism_literal, HostTensorToLiteral(temp)); - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ResolveInputDynamismIntoPredVector( @@ -377,7 +377,7 @@ static Status LiteralToInt64Vector(const xla::LiteralSlice& literal, } else { return errors::InvalidArgument("value must be either int32 or int64"); } - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ConstantInputAsIntVector( @@ -424,11 +424,11 @@ 
Status XlaOpKernelContext::ConstantInputAsInt64Literal( for (int64_t i = 0; i < src_data.size(); ++i) { out->data()[i] = src_data[i]; } - return OkStatus(); + return absl::OkStatus(); } case xla::S64: *out = std::move(literal); - return OkStatus(); + return absl::OkStatus(); default: return errors::InvalidArgument( @@ -462,7 +462,7 @@ Status XlaOpKernelContext::ConstantInputAsShape(int index, TensorShape* shape, ", result: ", num_elements); } *shape = TensorShape(dims); - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ConstantInputAsPartialShape( @@ -478,12 +478,12 @@ Status XlaOpKernelContext::ConstantInputAsPartialShape( "Cannot convert value to PartialTensorShape: ", shape_val); } *shape = PartialTensorShape(); // Shape with unknown rank. - return OkStatus(); + return absl::OkStatus(); } std::vector dims; TF_RETURN_IF_ERROR(LiteralToInt64Vector(literal, &dims)); *shape = PartialTensorShape(dims); - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::InputList(absl::string_view name, @@ -498,7 +498,7 @@ Status XlaOpKernelContext::InputList(absl::string_view name, XlaExpression::CastExpressionFromTensor(input)->AsXlaOp(builder())); shapes->push_back(input.shape()); } - return OkStatus(); + return absl::OkStatus(); } Status XlaOpKernelContext::ConstantInputList(absl::string_view name, @@ -510,7 +510,7 @@ Status XlaOpKernelContext::ConstantInputList(absl::string_view name, for (int i = start; i < stop; ++i) { TF_RETURN_IF_ERROR(ConstantInput(i, &(*outputs)[i], mode)); } - return OkStatus(); + return absl::OkStatus(); } StatusOr XlaOpKernelContext::ConstantInputTensor( @@ -571,7 +571,7 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, TF_ASSIGN_OR_RETURN(xla::Literal literal, HostTensorToLiteral(*expression->constant_value())); *value = xla::ConstantLiteral(ctx->builder(), literal); - return OkStatus(); + return absl::OkStatus(); } auto shape_determination_fns = 
ctx->compiler()->options().shape_determination_fns; @@ -590,7 +590,7 @@ Status ReadVariableInputTensor(const Tensor& tensor, DataType type, } else { *value = xla::Reshape(variable->value(), variable->shape().dim_sizes()); } - return OkStatus(); + return absl::OkStatus(); } } // namespace @@ -625,7 +625,7 @@ Status XlaOpKernelContext::GetVariableTypeAndShape(int index, DataType* type, } *type = variable->type(); *shape = variable->shape(); - return OkStatus(); + return absl::OkStatus(); } void XlaOpKernelContext::SetOutputExpression(int index, @@ -656,7 +656,7 @@ void XlaOpKernelContext::SetOutputExpression(int index, } XlaExpression::AssignExpressionToTensor(expression, context_->mutable_output(index)); - return OkStatus(); + return absl::OkStatus(); }(); if (!status.ok()) { SetStatus(status); @@ -697,7 +697,7 @@ Status XlaOpKernelContext::GetResourceInput(int index, XlaResource** resource) { XlaExpression::CastExpressionFromTensor(context_->input(index)); TF_RET_CHECK(expression->resource() != nullptr); *resource = expression->resource(); - return OkStatus(); + return absl::OkStatus(); } namespace { diff --git a/tensorflow/compiler/tf2xla/xla_op_registry.cc b/tensorflow/compiler/tf2xla/xla_op_registry.cc index 986388ab32b9f2..0109f6a3f07ef3 100644 --- a/tensorflow/compiler/tf2xla/xla_op_registry.cc +++ b/tensorflow/compiler/tf2xla/xla_op_registry.cc @@ -53,7 +53,7 @@ static Status LaunchOpHasKernelForDevice(const DeviceType& device_type) { &kernel_class_name)); VLOG(1) << "LaunchOpHasKernelForDevice" << " kernel_class_name: " << kernel_class_name; - return OkStatus(); + return absl::OkStatus(); } XlaOpRegistry::XlaOpRegistry() = default; @@ -437,7 +437,7 @@ XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { compile_time_constant_inputs = CompileTimeConstantInputArgNames(node_def.op()); if (compile_time_constant_inputs->empty()) { - return OkStatus(); + return absl::OkStatus(); } } @@ -470,7 +470,7 @@ 
XlaOpRegistry::CompileTimeConstantInputArgNames(const string& op) { } absl::c_sort(*result); - return OkStatus(); + return absl::OkStatus(); } /*static*/ bool XlaOpRegistry::IsMetadataOp(const string& op) { diff --git a/tensorflow/compiler/tf2xla/xla_resource.cc b/tensorflow/compiler/tf2xla/xla_resource.cc index d48c97d35c31f5..0e1d33a0c1c718 100644 --- a/tensorflow/compiler/tf2xla/xla_resource.cc +++ b/tensorflow/compiler/tf2xla/xla_resource.cc @@ -109,7 +109,7 @@ Status XlaResource::SetTypeAndShape(DataType type, const TensorShape& shape) { } type_ = type; shape_ = shape; - return OkStatus(); + return absl::OkStatus(); } Status XlaResource::SetValue(const xla::XlaOp& value) { @@ -120,7 +120,7 @@ Status XlaResource::SetValue(const xla::XlaOp& value) { } value_ = value; is_overwritten_ = true; - return OkStatus(); + return absl::OkStatus(); } Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { @@ -159,7 +159,7 @@ Status XlaResource::SetZeroValue(xla::XlaBuilder* builder) { default: LOG(FATAL) << "Invalid resource type"; } - return OkStatus(); + return absl::OkStatus(); } Status XlaResource::GetOrCreateTensorArrayGradient(const string& source, @@ -183,7 +183,7 @@ Status XlaResource::GetOrCreateTensorArrayGradient(const string& source, /*tensor_array_multiple_writes_aggregate=*/true)); } *gradient_out = gradient.get(); - return OkStatus(); + return absl::OkStatus(); } Status XlaResource::Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const { @@ -198,7 +198,7 @@ Status XlaResource::Pack(xla::XlaOp* pack, xla::XlaBuilder* builder) const { } *pack = xla::Tuple(builder, elems); } - return OkStatus(); + return absl::OkStatus(); } Status XlaResource::SetFromPack(const std::set& gradient_sources, @@ -229,7 +229,7 @@ Status XlaResource::SetFromPack(const std::set& gradient_sources, gradient->value_ = v; } } - return OkStatus(); + return absl::OkStatus(); } } // namespace tensorflow diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD index 
a3538f62e93456..dde60c6079d704 100644 --- a/tensorflow/core/BUILD +++ b/tensorflow/core/BUILD @@ -336,10 +336,8 @@ cc_library( ":lib_internal", ":protos_all_cc", "//tensorflow/core/kernels:required", - "@com_google_googletest//:gtest", "@local_tsl//tsl/platform:test", "@local_tsl//tsl/platform:test_benchmark", - "@local_tsl//tsl/platform/default/build_config:gtest", ] + tf_additional_test_deps(), ) @@ -499,7 +497,6 @@ cc_library( "//tensorflow/core/platform:platform_port", "//tensorflow/core/platform:thread_annotations", "//tensorflow/core/platform:mutex", - "@local_tsl//tsl/platform/default/build_config:minimal", "@local_tsl//tsl/framework:fixedpoint_types", "//tensorflow/core/platform:types", ], @@ -711,7 +708,7 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":tensorflow_opensource", - "@local_tsl//tsl/platform/default/build_config:tensorflow_platform_specific", + "//tensorflow/core/platform/default/build_config:tensorflow_platform_specific", ], ) @@ -1142,8 +1139,8 @@ cc_library( deps = [ ":portable_tensorflow_lib", "//tensorflow/core/kernels:portable_tensorflow_kernels", + "@com_google_googletest//:gtest", "@eigen_archive//:eigen3", - "@local_tsl//tsl/platform/default/build_config:gtest", ], ) diff --git a/tensorflow/core/api_def/base_api/api_def_DatasetFingerprint.pbtxt b/tensorflow/core/api_def/base_api/api_def_DatasetFingerprint.pbtxt new file mode 100644 index 00000000000000..0d2e0ff1793861 --- /dev/null +++ b/tensorflow/core/api_def/base_api/api_def_DatasetFingerprint.pbtxt @@ -0,0 +1,20 @@ +op { + graph_op_name: "DatasetFingerprint" + visibility: HIDDEN + in_arg { + name: "input_dataset" + description: <